2095 lines
75 KiB
Python
2095 lines
75 KiB
Python
"""
|
|
The :mod:`sklearn.model_selection._split` module includes classes and
|
|
functions to split the data based on a preset strategy.
|
|
"""
|
|
|
|
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>,
|
|
# Gael Varoquaux <gael.varoquaux@normalesup.org>,
|
|
# Olivier Grisel <olivier.grisel@ensta.org>
|
|
# Raghav RV <rvraghav93@gmail.com>
|
|
# License: BSD 3 clause
|
|
|
|
|
|
from __future__ import print_function
|
|
from __future__ import division
|
|
|
|
import warnings
|
|
from itertools import chain, combinations
|
|
from math import ceil, floor
|
|
import numbers
|
|
from abc import ABCMeta, abstractmethod
|
|
|
|
import numpy as np
|
|
|
|
from ..utils import indexable, check_random_state, safe_indexing
|
|
from ..utils.validation import _num_samples, column_or_1d
|
|
from ..utils.validation import check_array
|
|
from ..utils.multiclass import type_of_target
|
|
from ..externals.six import with_metaclass
|
|
from ..externals.six.moves import zip
|
|
from ..utils.fixes import signature, comb
|
|
from ..utils.fixes import _Iterable as Iterable
|
|
from ..base import _pprint
|
|
|
|
# Names exported by ``from ... import *`` for this module.
# ``TimeSeriesSplit`` is a public cross-validator defined in this module and
# is therefore listed alongside the other splitters.
__all__ = ['BaseCrossValidator',
           'KFold',
           'GroupKFold',
           'LeaveOneGroupOut',
           'LeaveOneOut',
           'LeavePGroupsOut',
           'LeavePOut',
           'RepeatedStratifiedKFold',
           'RepeatedKFold',
           'ShuffleSplit',
           'GroupShuffleSplit',
           'StratifiedKFold',
           'StratifiedShuffleSplit',
           'TimeSeriesSplit',
           'PredefinedSplit',
           'train_test_split',
           'check_cv']
|
|
|
|
|
|
class BaseCrossValidator(with_metaclass(ABCMeta)):
    """Base class for all cross-validators

    Implementations must define `_iter_test_masks` or `_iter_test_indices`.
    """

    def __init__(self):
        # We need this for the build_repr to work properly in py2.7
        # see #6304
        pass

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, of length n_samples
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        X, y, groups = indexable(X, y, groups)
        indices = np.arange(_num_samples(X))
        for test_index in self._iter_test_masks(X, y, groups):
            # The training set is the complement of the boolean test mask.
            train_index = indices[np.logical_not(test_index)]
            test_index = indices[test_index]
            yield train_index, test_index

    # Since subclasses must implement either _iter_test_masks or
    # _iter_test_indices, neither can be abstract.
    def _iter_test_masks(self, X=None, y=None, groups=None):
        """Generates boolean masks corresponding to test sets.

        By default, delegates to _iter_test_indices(X, y, groups) and turns
        each set of integer indices into a boolean mask with one entry per
        sample of ``X``.
        """
        for test_index in self._iter_test_indices(X, y, groups):
            # Use the builtin ``bool``: the ``np.bool`` alias was deprecated
            # in NumPy 1.20 and removed in NumPy 1.24.
            test_mask = np.zeros(_num_samples(X), dtype=bool)
            test_mask[test_index] = True
            yield test_mask

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Generates integer indices corresponding to test sets."""
        raise NotImplementedError

    @abstractmethod
    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator"""

    def __repr__(self):
        return _build_repr(self)
|
|
|
|
|
|
class LeaveOneOut(BaseCrossValidator):
    """Leave-One-Out cross-validator

    Produces train/test index pairs in which every sample serves exactly
    once as a single-element test set; all other samples make up the
    training set of that split.

    Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and
    ``LeavePOut(p=1)`` where ``n`` is the number of samples.

    Because there are as many test sets as samples, this strategy can be
    very costly. For large datasets one should favor :class:`KFold`,
    :class:`ShuffleSplit` or :class:`StratifiedKFold`.

    Read more in the :ref:`User Guide <cross_validation>`.

    Examples
    --------
    >>> from sklearn.model_selection import LeaveOneOut
    >>> X = np.array([[1, 2], [3, 4]])
    >>> y = np.array([1, 2])
    >>> loo = LeaveOneOut()
    >>> loo.get_n_splits(X)
    2
    >>> print(loo)
    LeaveOneOut()
    >>> for train_index, test_index in loo.split(X):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    ...    print(X_train, X_test, y_train, y_test)
    TRAIN: [1] TEST: [0]
    [[3 4]] [[1 2]] [2] [1]
    TRAIN: [0] TEST: [1]
    [[1 2]] [[3 4]] [1] [2]

    See also
    --------
    LeaveOneGroupOut
        For splitting the data according to explicit, domain-specific
        stratification of the dataset.

    GroupKFold: K-fold iterator variant with non-overlapping groups.
    """

    def _iter_test_indices(self, X, y=None, groups=None):
        """Return one test index per sample of ``X``."""
        n_samples = _num_samples(X)
        return range(n_samples)

    def get_n_splits(self, X, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            The number of splits, which is the number of samples in ``X``.

        Raises
        ------
        ValueError
            If ``X`` is None.
        """
        if X is not None:
            return _num_samples(X)
        raise ValueError("The 'X' parameter should not be None.")
|
|
|
|
|
|
class LeavePOut(BaseCrossValidator):
    """Leave-P-Out cross-validator

    Produces train/test index pairs such that every distinct subset of
    ``p`` samples is used once as a test set, with the remaining ``n - p``
    samples forming the training set of that iteration.

    Note: ``LeavePOut(p)`` is NOT equivalent to
    ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets.

    The number of iterations grows combinatorically with the number of
    samples, so this strategy can be very costly. For large datasets one
    should favor :class:`KFold`, :class:`StratifiedKFold` or
    :class:`ShuffleSplit`.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    p : int
        Size of the test sets.

    Examples
    --------
    >>> from sklearn.model_selection import LeavePOut
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y = np.array([1, 2, 3, 4])
    >>> lpo = LeavePOut(2)
    >>> lpo.get_n_splits(X)
    6
    >>> print(lpo)
    LeavePOut(p=2)
    >>> for train_index, test_index in lpo.split(X):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [2 3] TEST: [0 1]
    TRAIN: [1 3] TEST: [0 2]
    TRAIN: [1 2] TEST: [0 3]
    TRAIN: [0 3] TEST: [1 2]
    TRAIN: [0 2] TEST: [1 3]
    TRAIN: [0 1] TEST: [2 3]
    """

    def __init__(self, p):
        self.p = p

    def _iter_test_indices(self, X, y=None, groups=None):
        """Yield every combination of ``p`` sample indices as an array."""
        sample_ids = range(_num_samples(X))
        for chosen in combinations(sample_ids, self.p):
            yield np.array(chosen)

    def get_n_splits(self, X, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            The number of ways to choose ``p`` samples out of ``n_samples``.

        Raises
        ------
        ValueError
            If ``X`` is None.
        """
        if X is not None:
            n_samples = _num_samples(X)
            return int(comb(n_samples, self.p, exact=True))
        raise ValueError("The 'X' parameter should not be None.")
|
|
|
|
|
|
class _BaseKFold(with_metaclass(ABCMeta, BaseCrossValidator)):
    """Base class for KFold, GroupKFold, and StratifiedKFold"""

    @abstractmethod
    def __init__(self, n_splits, shuffle, random_state):
        # Validate the arguments first; attributes are only stored once
        # every check has passed.
        n_splits_is_integral = isinstance(n_splits, numbers.Integral)
        if not n_splits_is_integral:
            raise ValueError(
                'The number of folds must be of Integral type. %s of type %s'
                ' was passed.' % (n_splits, type(n_splits)))
        n_splits = int(n_splits)

        if n_splits < 2:
            raise ValueError(
                "k-fold cross-validation requires at least one train/test"
                " split by setting n_splits=2 or more, got n_splits={0}."
                .format(n_splits))

        shuffle_is_bool = isinstance(shuffle, bool)
        if not shuffle_is_bool:
            raise TypeError(
                "shuffle must be True or False; got {0}".format(shuffle))

        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Raises
        ------
        ValueError
            If ``n_splits`` is greater than the number of samples.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        if n_samples < self.n_splits:
            message = ("Cannot have number of splits n_splits={0} greater"
                       " than the number of samples: {1}.")
            raise ValueError(message.format(self.n_splits, n_samples))

        # Delegate the actual index generation to BaseCrossValidator.split.
        base_splits = super(_BaseKFold, self).split(X, y, groups)
        for train, test in base_splits:
            yield train, test

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            The ``n_splits`` value stored at construction time.
        """
        return self.n_splits
|
|
|
|
|
|
class KFold(_BaseKFold):
    """K-Folds cross-validator

    Provides train/test indices to split data in train/test sets. Split
    dataset into k consecutive folds (without shuffling by default).

    Each fold is then used once as a validation while the k - 1 remaining
    folds form the training set.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.

    shuffle : boolean, optional
        Whether to shuffle the data before splitting into batches.

    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used when ``shuffle`` == True.

    Examples
    --------
    >>> from sklearn.model_selection import KFold
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([1, 2, 3, 4])
    >>> kf = KFold(n_splits=2)
    >>> kf.get_n_splits(X)
    2
    >>> print(kf)  # doctest: +NORMALIZE_WHITESPACE
    KFold(n_splits=2, random_state=None, shuffle=False)
    >>> for train_index, test_index in kf.split(X):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [2 3] TEST: [0 1]
    TRAIN: [0 1] TEST: [2 3]

    Notes
    -----
    The first ``n_samples % n_splits`` folds have size
    ``n_samples // n_splits + 1``, other folds have size
    ``n_samples // n_splits``, where ``n_samples`` is the number of samples.

    See also
    --------
    StratifiedKFold
        Takes class information into account to avoid building folds with
        imbalanced class distributions (for binary or multiclass
        classification tasks).

    GroupKFold: K-fold iterator variant with non-overlapping groups.

    RepeatedKFold: Repeats K-Fold n times.
    """

    def __init__(self, n_splits=3, shuffle=False,
                 random_state=None):
        super(KFold, self).__init__(n_splits, shuffle, random_state)

    def _iter_test_indices(self, X, y=None, groups=None):
        """Yield the test indices of each of the ``n_splits`` folds in turn.

        The sample indices are optionally shuffled and then cut into
        consecutive slices; the first ``n_samples % n_splits`` slices hold
        one extra sample.
        """
        n_samples = _num_samples(X)
        indices = np.arange(n_samples)
        if self.shuffle:
            check_random_state(self.random_state).shuffle(indices)

        n_splits = self.n_splits
        # Use the builtin ``int``: the ``np.int`` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=int)
        # Spread the remainder over the leading folds.
        fold_sizes[:n_samples % n_splits] += 1
        current = 0
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            yield indices[start:stop]
            current = stop
|
|
|
|
|
|
class GroupKFold(_BaseKFold):
    """K-fold iterator variant with non-overlapping groups.

    A given group never appears in two different folds, so the number of
    distinct groups has to be at least equal to the number of folds.

    The folds are approximately balanced in the sense that the number of
    distinct groups is approximately the same in each fold.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.

    Examples
    --------
    >>> from sklearn.model_selection import GroupKFold
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y = np.array([1, 2, 3, 4])
    >>> groups = np.array([0, 0, 2, 2])
    >>> group_kfold = GroupKFold(n_splits=2)
    >>> group_kfold.get_n_splits(X, y, groups)
    2
    >>> print(group_kfold)
    GroupKFold(n_splits=2)
    >>> for train_index, test_index in group_kfold.split(X, y, groups):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    ...     print(X_train, X_test, y_train, y_test)
    ...
    TRAIN: [0 1] TEST: [2 3]
    [[1 2]
     [3 4]] [[5 6]
     [7 8]] [1 2] [3 4]
    TRAIN: [2 3] TEST: [0 1]
    [[5 6]
     [7 8]] [[1 2]
     [3 4]] [3 4] [1 2]

    See also
    --------
    LeaveOneGroupOut
        For splitting the data according to explicit domain-specific
        stratification of the dataset.
    """
    def __init__(self, n_splits=3):
        super(GroupKFold, self).__init__(n_splits, shuffle=False,
                                         random_state=None)

    def _iter_test_indices(self, X, y, groups):
        """Yield, for each fold, the indices of the samples assigned to it.

        Groups are handed out greedily: the largest remaining group always
        goes to the fold that currently holds the fewest samples.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        groups = check_array(groups, ensure_2d=False, dtype=None)

        # Re-encode the labels as integer codes 0 .. n_groups - 1.
        unique_groups, groups = np.unique(groups, return_inverse=True)
        n_groups = len(unique_groups)

        if n_groups < self.n_splits:
            raise ValueError("Cannot have number of splits n_splits=%d greater"
                             " than the number of groups: %d."
                             % (self.n_splits, n_groups))

        # Number of samples carried by each group code.
        group_sizes = np.bincount(groups)

        # Group codes ordered from the most to the least populated.
        by_decreasing_size = np.argsort(group_sizes)[::-1]

        # Running sample count of every fold.
        fold_loads = np.zeros(self.n_splits)

        # Fold assigned to each group code.
        fold_of_group = np.zeros(n_groups)

        for group_code in by_decreasing_size:
            target_fold = np.argmin(fold_loads)
            fold_loads[target_fold] += group_sizes[group_code]
            fold_of_group[group_code] = target_fold

        # Fold assigned to each individual sample.
        fold_of_sample = fold_of_group[groups]

        for fold in range(self.n_splits):
            yield np.where(fold_of_sample == fold)[0]
|
|
|
|
|
|
class StratifiedKFold(_BaseKFold):
    """Stratified K-Folds cross-validator

    Provides train/test indices to split data in train/test sets.

    This cross-validation object is a variation of KFold that returns
    stratified folds. The folds are made by preserving the percentage of
    samples for each class.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=3
        Number of folds. Must be at least 2.

    shuffle : boolean, optional
        Whether to shuffle each stratification of the data before splitting
        into batches.

    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used when ``shuffle`` == True.

    Examples
    --------
    >>> from sklearn.model_selection import StratifiedKFold
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> skf = StratifiedKFold(n_splits=2)
    >>> skf.get_n_splits(X, y)
    2
    >>> print(skf)  # doctest: +NORMALIZE_WHITESPACE
    StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
    >>> for train_index, test_index in skf.split(X, y):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 3] TEST: [0 2]
    TRAIN: [0 2] TEST: [1 3]

    Notes
    -----
    All the folds have size ``trunc(n_samples / n_splits)``, the last one has
    the complementary.

    See also
    --------
    RepeatedStratifiedKFold: Repeats Stratified K-Fold n times.
    """

    def __init__(self, n_splits=3, shuffle=False, random_state=None):
        super(StratifiedKFold, self).__init__(n_splits, shuffle, random_state)

    def _make_test_folds(self, X, y=None):
        """Assign every sample to the index of the fold it is tested in.

        Returns an integer array of length ``n_samples`` whose i-th entry
        is the test-fold index of sample i.

        Raises ValueError if ``y`` is not a binary or multiclass target, or
        if ``n_splits`` exceeds the number of members of every class. Emits
        a warning if ``n_splits`` exceeds the size of the smallest class.
        """
        rng = self.random_state
        y = np.asarray(y)
        type_of_target_y = type_of_target(y)
        allowed_target_types = ('binary', 'multiclass')
        if type_of_target_y not in allowed_target_types:
            raise ValueError(
                'Supported target types are: {}. Got {!r} instead.'.format(
                    allowed_target_types, type_of_target_y))

        y = column_or_1d(y)
        n_samples = y.shape[0]
        unique_y, y_inversed = np.unique(y, return_inverse=True)
        y_counts = np.bincount(y_inversed)
        min_groups = np.min(y_counts)
        if np.all(self.n_splits > y_counts):
            raise ValueError("n_splits=%d cannot be greater than the"
                             " number of members in each class."
                             % (self.n_splits))
        if self.n_splits > min_groups:
            warnings.warn(("The least populated class in y has only %d"
                           " members, which is too few. The minimum"
                           " number of members in any class cannot"
                           " be less than n_splits=%d."
                           % (min_groups, self.n_splits)), Warning)

        # pre-assign each sample to a test fold index using individual KFold
        # splitting strategies for each class so as to respect the balance of
        # classes
        # NOTE: Passing the data corresponding to ith class say X[y==class_i]
        # will break when the data is not 100% stratifiable for all classes.
        # So we pass np.zeroes(max(c, n_splits)) as data to the KFold
        per_cls_cvs = [
            KFold(self.n_splits, shuffle=self.shuffle,
                  random_state=rng).split(np.zeros(max(count, self.n_splits)))
            for count in y_counts]

        # Use the builtin ``int``: the ``np.int`` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        test_folds = np.zeros(n_samples, dtype=int)
        for test_fold_indices, per_cls_splits in enumerate(zip(*per_cls_cvs)):
            for cls, (_, test_split) in zip(unique_y, per_cls_splits):
                cls_test_folds = test_folds[y == cls]
                # the test split can be too big because we used
                # KFold(...).split(X[:max(c, n_splits)]) when data is not 100%
                # stratifiable for all the classes
                # (we use a warning instead of raising an exception)
                # If this is the case, let's trim it:
                test_split = test_split[test_split < len(cls_test_folds)]
                cls_test_folds[test_split] = test_fold_indices
                test_folds[y == cls] = cls_test_folds

        return test_folds

    def _iter_test_masks(self, X, y=None, groups=None):
        """Yield one boolean test mask per fold index."""
        test_folds = self._make_test_folds(X, y)
        for i in range(self.n_splits):
            yield test_folds == i

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.
            Stratification is done based on the y labels.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        y = check_array(y, ensure_2d=False, dtype=None)
        return super(StratifiedKFold, self).split(X, y, groups)
|
|
|
|
|
|
class TimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator

    Produces train/test indices for time series samples observed at fixed
    time intervals. In each split the test indices must be higher than in
    the previous one, so shuffling in the cross validator is inappropriate.

    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.

    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=3
        Number of splits. Must be at least 2.

    max_train_size : int, optional
        Maximum size for a single training set.

    Examples
    --------
    >>> from sklearn.model_selection import TimeSeriesSplit
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([1, 2, 3, 4])
    >>> tscv = TimeSeriesSplit(n_splits=3)
    >>> print(tscv)  # doctest: +NORMALIZE_WHITESPACE
    TimeSeriesSplit(max_train_size=None, n_splits=3)
    >>> for train_index, test_index in tscv.split(X):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    TRAIN: [0] TEST: [1]
    TRAIN: [0 1] TEST: [2]
    TRAIN: [0 1 2] TEST: [3]

    Notes
    -----
    The training set has size ``i * n_samples // (n_splits + 1)
    + n_samples % (n_splits + 1)`` in the ``i``th split,
    with a test set of size ``n_samples//(n_splits + 1)``,
    where ``n_samples`` is the number of samples.
    """
    def __init__(self, n_splits=3, max_train_size=None):
        super(TimeSeriesSplit, self).__init__(n_splits,
                                              shuffle=False,
                                              random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.

        groups : array-like, with shape (n_samples,), optional
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Raises
        ------
        ValueError
            If ``n_splits + 1`` is greater than the number of samples.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_folds = self.n_splits + 1
        if n_samples < n_folds:
            message = ("Cannot have number of folds ={0} greater"
                       " than the number of samples: {1}.")
            raise ValueError(message.format(n_folds, n_samples))

        indices = np.arange(n_samples)
        test_size = n_samples // n_folds
        # The first training window also absorbs the remainder samples.
        first_test_start = test_size + n_samples % n_folds
        for test_start in range(first_test_start, n_samples, test_size):
            test_stop = test_start + test_size
            if self.max_train_size and self.max_train_size < test_start:
                # Keep only the most recent ``max_train_size`` samples.
                train_start = test_start - self.max_train_size
            else:
                train_start = 0
            yield (indices[train_start:test_start],
                   indices[test_start:test_stop])
|
|
|
|
|
|
class LeaveOneGroupOut(BaseCrossValidator):
    """Leave One Group Out cross-validator

    Produces train/test indices that split the data according to a
    third-party provided group. This group information can be used to
    encode arbitrary domain specific stratifications of the samples as
    integers.

    For instance the groups could be the year of collection of the samples
    and thus allow for cross-validation against time-based splits.

    Read more in the :ref:`User Guide <cross_validation>`.

    Examples
    --------
    >>> from sklearn.model_selection import LeaveOneGroupOut
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y = np.array([1, 2, 1, 2])
    >>> groups = np.array([1, 1, 2, 2])
    >>> logo = LeaveOneGroupOut()
    >>> logo.get_n_splits(X, y, groups)
    2
    >>> logo.get_n_splits(groups=groups) # 'groups' is always required
    2
    >>> print(logo)
    LeaveOneGroupOut()
    >>> for train_index, test_index in logo.split(X, y, groups):
    ...    print("TRAIN:", train_index, "TEST:", test_index)
    ...    X_train, X_test = X[train_index], X[test_index]
    ...    y_train, y_test = y[train_index], y[test_index]
    ...    print(X_train, X_test, y_train, y_test)
    TRAIN: [2 3] TEST: [0 1]
    [[5 6]
     [7 8]] [[1 2]
     [3 4]] [1 2] [1 2]
    TRAIN: [0 1] TEST: [2 3]
    [[1 2]
     [3 4]] [[5 6]
     [7 8]] [1 2] [1 2]

    """

    def _iter_test_masks(self, X, y, groups):
        """Yield, for each unique group, the mask of samples belonging to it.

        Raises ValueError if ``groups`` is None or holds fewer than two
        unique values.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        # We make a copy of groups to avoid side-effects during iteration
        groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)
        unique_groups = np.unique(groups)
        if len(unique_groups) < 2:
            raise ValueError(
                "The groups parameter contains fewer than 2 unique groups "
                "(%s). LeaveOneGroupOut expects at least 2." % unique_groups)
        for held_out_group in unique_groups:
            yield groups == held_out_group

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object, optional
            Always ignored, exists for compatibility.

        y : object, optional
            Always ignored, exists for compatibility.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set. This 'groups' parameter must always be specified to
            calculate the number of splits, though the other parameters can be
            omitted.

        Returns
        -------
        n_splits : int
            The number of unique values found in ``groups``.

        Raises
        ------
        ValueError
            If ``groups`` is None.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        groups = check_array(groups, ensure_2d=False, dtype=None)
        unique_groups = np.unique(groups)
        return len(unique_groups)
|
|
|
|
|
|
class LeavePGroupsOut(BaseCrossValidator):
|
|
"""Leave P Group(s) Out cross-validator
|
|
|
|
Provides train/test indices to split data according to a third-party
|
|
provided group. This group information can be used to encode arbitrary
|
|
domain specific stratifications of the samples as integers.
|
|
|
|
For instance the groups could be the year of collection of the samples
|
|
and thus allow for cross-validation against time-based splits.
|
|
|
|
The difference between LeavePGroupsOut and LeaveOneGroupOut is that
|
|
the former builds the test sets with all the samples assigned to
|
|
``p`` different values of the groups while the latter uses samples
|
|
all assigned the same groups.
|
|
|
|
Read more in the :ref:`User Guide <cross_validation>`.
|
|
|
|
Parameters
|
|
----------
|
|
n_groups : int
|
|
Number of groups (``p``) to leave out in the test split.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.model_selection import LeavePGroupsOut
|
|
>>> X = np.array([[1, 2], [3, 4], [5, 6]])
|
|
>>> y = np.array([1, 2, 1])
|
|
>>> groups = np.array([1, 2, 3])
|
|
>>> lpgo = LeavePGroupsOut(n_groups=2)
|
|
>>> lpgo.get_n_splits(X, y, groups)
|
|
3
|
|
>>> lpgo.get_n_splits(groups=groups) # 'groups' is always required
|
|
3
|
|
>>> print(lpgo)
|
|
LeavePGroupsOut(n_groups=2)
|
|
>>> for train_index, test_index in lpgo.split(X, y, groups):
|
|
... print("TRAIN:", train_index, "TEST:", test_index)
|
|
... X_train, X_test = X[train_index], X[test_index]
|
|
... y_train, y_test = y[train_index], y[test_index]
|
|
... print(X_train, X_test, y_train, y_test)
|
|
TRAIN: [2] TEST: [0 1]
|
|
[[5 6]] [[1 2]
|
|
[3 4]] [1] [1 2]
|
|
TRAIN: [1] TEST: [0 2]
|
|
[[3 4]] [[1 2]
|
|
[5 6]] [2] [1 1]
|
|
TRAIN: [0] TEST: [1 2]
|
|
[[1 2]] [[3 4]
|
|
[5 6]] [1] [2 1]
|
|
|
|
See also
|
|
--------
|
|
GroupKFold: K-fold iterator variant with non-overlapping groups.
|
|
"""
|
|
|
|
def __init__(self, n_groups):
|
|
self.n_groups = n_groups
|
|
|
|
def _iter_test_masks(self, X, y, groups):
|
|
if groups is None:
|
|
raise ValueError("The 'groups' parameter should not be None.")
|
|
groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)
|
|
unique_groups = np.unique(groups)
|
|
if self.n_groups >= len(unique_groups):
|
|
raise ValueError(
|
|
"The groups parameter contains fewer than (or equal to) "
|
|
"n_groups (%d) numbers of unique groups (%s). LeavePGroupsOut "
|
|
"expects that at least n_groups + 1 (%d) unique groups be "
|
|
"present" % (self.n_groups, unique_groups, self.n_groups + 1))
|
|
combi = combinations(range(len(unique_groups)), self.n_groups)
|
|
for indices in combi:
|
|
test_index = np.zeros(_num_samples(X), dtype=np.bool)
|
|
for l in unique_groups[np.array(indices)]:
|
|
test_index[groups == l] = True
|
|
yield test_index
|
|
|
|
def get_n_splits(self, X=None, y=None, groups=None):
    """Return the number of splitting iterations in the cross-validator.

    Parameters
    ----------
    X : object, optional
        Always ignored, exists for compatibility.

    y : object, optional
        Always ignored, exists for compatibility.

    groups : array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set. Although the signature allows omitting it, this
        argument is required: the number of splits is derived from the
        number of distinct groups.

    Returns
    -------
    n_splits : int
        Number of ways to choose ``n_groups`` groups among the unique
        groups.

    Raises
    ------
    ValueError
        If ``groups`` is None.
    """
    if groups is None:
        raise ValueError("The 'groups' parameter should not be None.")
    validated_groups = check_array(groups, ensure_2d=False, dtype=None)
    n_unique_groups = len(np.unique(validated_groups))
    n_combinations = comb(n_unique_groups, self.n_groups, exact=True)
    return int(n_combinations)
|
|
|
|
|
|
class _RepeatedSplits(with_metaclass(ABCMeta)):
    """Repeated splits for an arbitrary randomized CV splitter.

    A new cross-validator is built for every repetition, all of them
    sharing one random number generator, so each repetition is shuffled
    differently.

    Parameters
    ----------
    cv : callable
        Cross-validator class. It is called with ``random_state``,
        ``shuffle=True`` and ``**cvargs``.

    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.

    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    **cvargs : additional params
        Constructor parameters for cv. Must not contain random_state
        and shuffle.
    """
    def __init__(self, cv, n_repeats=10, random_state=None, **cvargs):
        integral_types = (np.integer, numbers.Integral)
        if not isinstance(n_repeats, integral_types):
            raise ValueError("Number of repetitions must be of Integral type.")

        if n_repeats <= 0:
            raise ValueError("Number of repetitions must be greater than 0.")

        # Both arguments are supplied by this class when it builds ``cv``.
        if 'random_state' in cvargs or 'shuffle' in cvargs:
            raise ValueError(
                "cvargs must not contain random_state or shuffle.")

        self.cv = cv
        self.n_repeats = n_repeats
        self.random_state = random_state
        self.cvargs = cvargs

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, of length n_samples
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        rng = check_random_state(self.random_state)

        for _ in range(self.n_repeats):
            # The same generator is handed to every repetition, so its
            # state carries over from one repetition to the next.
            splitter = self.cv(random_state=rng, shuffle=True,
                               **self.cvargs)
            for train_index, test_index in splitter.split(X, y, groups):
                yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splitting iterations in the cross-validator.

        Parameters
        ----------
        X : object
            Forwarded unchanged to the ``get_n_splits`` method of the
            wrapped cross-validator.
            ``np.zeros(n_samples)`` may be used as a placeholder.

        y : object
            Forwarded unchanged to the ``get_n_splits`` method of the
            wrapped cross-validator.
            ``np.zeros(n_samples)`` may be used as a placeholder.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        n_splits : int
            Splits of a single repetition multiplied by ``n_repeats``.
        """
        rng = check_random_state(self.random_state)
        splitter = self.cv(random_state=rng, shuffle=True,
                           **self.cvargs)
        splits_per_repeat = splitter.get_n_splits(X, y, groups)
        return splits_per_repeat * self.n_repeats
|
|
|
|
|
|
class RepeatedKFold(_RepeatedSplits):
    """Repeated K-Fold cross validator.

    Runs K-Fold ``n_repeats`` times, with a different randomization in
    each repetition.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.

    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Examples
    --------
    >>> from sklearn.model_selection import RepeatedKFold
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)
    >>> for train_index, test_index in rkf.split(X):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    ...
    TRAIN: [0 1] TEST: [2 3]
    TRAIN: [2 3] TEST: [0 1]
    TRAIN: [1 2] TEST: [0 3]
    TRAIN: [0 3] TEST: [1 2]


    See also
    --------
    RepeatedStratifiedKFold: Repeats Stratified K-Fold n times.
    """
    def __init__(self, n_splits=5, n_repeats=10, random_state=None):
        # ``n_splits`` travels through ``**cvargs`` to the KFold constructor.
        super(RepeatedKFold, self).__init__(
            cv=KFold,
            n_repeats=n_repeats,
            random_state=random_state,
            n_splits=n_splits)
|
|
|
|
|
|
class RepeatedStratifiedKFold(_RepeatedSplits):
    """Repeated Stratified K-Fold cross validator.

    Runs Stratified K-Fold ``n_repeats`` times, with a different
    randomization in each repetition.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.

    random_state : None, int or RandomState, default=None
        Random state to be used to generate random state for each
        repetition.

    Examples
    --------
    >>> from sklearn.model_selection import RepeatedStratifiedKFold
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2,
    ...     random_state=36851234)
    >>> for train_index, test_index in rskf.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    ...
    TRAIN: [1 2] TEST: [0 3]
    TRAIN: [0 3] TEST: [1 2]
    TRAIN: [1 3] TEST: [0 2]
    TRAIN: [0 2] TEST: [1 3]


    See also
    --------
    RepeatedKFold: Repeats K-Fold n times.
    """
    def __init__(self, n_splits=5, n_repeats=10, random_state=None):
        # ``n_splits`` travels through ``**cvargs`` to StratifiedKFold.
        super(RepeatedStratifiedKFold, self).__init__(
            cv=StratifiedKFold,
            n_repeats=n_repeats,
            random_state=random_state,
            n_splits=n_splits)
|
|
|
|
|
|
class BaseShuffleSplit(with_metaclass(ABCMeta)):
    """Base class for ShuffleSplit and StratifiedShuffleSplit.

    Subclasses implement ``_iter_indices``; ``split`` only makes the
    inputs indexable and relays what ``_iter_indices`` produces.
    """

    def __init__(self, n_splits=10, test_size="default", train_size=None,
                 random_state=None):
        # Reject inconsistent sizes before any attribute is set.
        _validate_shuffle_split_init(test_size, train_size)
        self.n_splits = n_splits
        self.random_state = random_state
        self.train_size = train_size
        self.test_size = test_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        X, y, groups = indexable(X, y, groups)
        for train_index, test_index in self._iter_indices(X, y, groups):
            yield train_index, test_index

    @abstractmethod
    def _iter_indices(self, X, y=None, groups=None):
        """Generate (train, test) indices"""

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splitting iterations in the cross-validator.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            The ``n_splits`` value given to the constructor.
        """
        return self.n_splits

    def __repr__(self):
        return _build_repr(self)
|
|
|
|
|
|
class ShuffleSplit(BaseShuffleSplit):
    """Random permutation cross-validator

    Yields indices to split data into training and test sets.

    Note: contrary to other cross-validation strategies, random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default 10
        Number of re-shuffling & splitting iterations.

    test_size : float, int, None, default=0.1
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. By default (the parameter is
        unspecified), the value is set to 0.1.
        The default will change in version 0.21. It will remain 0.1 only
        if ``train_size`` is unspecified, otherwise it will complement
        the specified ``train_size``.

    train_size : float, int, or None, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Examples
    --------
    >>> from sklearn.model_selection import ShuffleSplit
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y = np.array([1, 2, 1, 2])
    >>> rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)
    >>> rs.get_n_splits(X)
    3
    >>> print(rs)
    ShuffleSplit(n_splits=3, random_state=0, test_size=0.25, train_size=None)
    >>> for train_index, test_index in rs.split(X):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...  # doctest: +ELLIPSIS
    TRAIN: [3 1 0] TEST: [2]
    TRAIN: [2 1 3] TEST: [0]
    TRAIN: [0 2 1] TEST: [3]
    >>> rs = ShuffleSplit(n_splits=3, train_size=0.5, test_size=.25,
    ...                   random_state=0)
    >>> for train_index, test_index in rs.split(X):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...  # doctest: +ELLIPSIS
    TRAIN: [3 1] TEST: [2]
    TRAIN: [2 1] TEST: [0]
    TRAIN: [0 2] TEST: [3]
    """

    def _iter_indices(self, X, y=None, groups=None):
        """Yield ``n_splits`` random (train, test) index pairs."""
        n_samples = _num_samples(X)
        n_train, n_test = _validate_shuffle_split(n_samples,
                                                  self.test_size,
                                                  self.train_size)
        rng = check_random_state(self.random_state)
        train_stop = n_test + n_train
        for _ in range(self.n_splits):
            # Head of the shuffled order is the test set, the following
            # ``n_train`` entries are the train set.
            shuffled = rng.permutation(n_samples)
            yield shuffled[n_test:train_stop], shuffled[:n_test]
|
|
|
|
|
|
class GroupShuffleSplit(ShuffleSplit):
    """Shuffle-Group(s)-Out cross-validation iterator

    Provides randomized train/test indices to split data according to a
    third-party provided group. This group information can be used to encode
    arbitrary domain specific stratifications of the samples as integers.

    For instance the groups could be the year of collection of the samples
    and thus allow for cross-validation against time-based splits.

    The difference between LeavePGroupsOut and GroupShuffleSplit is that
    the former generates splits using all subsets of size ``p`` unique groups,
    whereas GroupShuffleSplit generates a user-determined number of random
    test splits, each with a user-determined fraction of unique groups.

    For example, a less computationally intensive alternative to
    ``LeavePGroupsOut(n_groups=10)`` would be
    ``GroupShuffleSplit(test_size=10, n_splits=100)``.

    Note: The parameters ``test_size`` and ``train_size`` refer to groups, and
    not to samples, as in ShuffleSplit.


    Parameters
    ----------
    n_splits : int (default 5)
        Number of re-shuffling & splitting iterations.

    test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the groups to include in the test split. If int, represents the
        absolute number of test groups. If None, the value is set to the
        complement of the train size. By default, the value is set to 0.2.
        The default will change in version 0.21. It will remain 0.2 only
        if ``train_size`` is unspecified, otherwise it will complement
        the specified ``train_size``.

    train_size : float, int, or None, default is None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the groups to include in the train split. If
        int, represents the absolute number of train groups. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    """

    def __init__(self, n_splits=5, test_size="default", train_size=None,
                 random_state=None):
        if test_size == "default":
            if train_size is not None:
                warnings.warn("From version 0.21, test_size will always "
                              "complement train_size unless both "
                              "are specified.",
                              FutureWarning)
            # Resolved here because this class defaults to 0.2.
            test_size = 0.2

        super(GroupShuffleSplit, self).__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)

    def _iter_indices(self, X, y, groups):
        """Shuffle the unique groups, then map them back to sample indices."""
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        groups = check_array(groups, ensure_2d=False, dtype=None)
        classes, group_indices = np.unique(groups, return_inverse=True)
        # The parent splitter works on the unique groups, so the indices
        # it yields are positions in ``classes``.
        group_splits = super(GroupShuffleSplit, self)._iter_indices(X=classes)
        for train_groups, test_groups in group_splits:
            # Turn group positions into the samples belonging to them.
            in_train = np.in1d(group_indices, train_groups)
            in_test = np.in1d(group_indices, test_groups)
            yield np.flatnonzero(in_train), np.flatnonzero(in_test)
|
|
|
|
|
|
def _approximate_mode(class_counts, n_draws, rng):
    """Computes approximate mode of multivariate hypergeometric.

    This is an approximation to the mode of the multivariate
    hypergeometric given by class_counts and n_draws.
    It shouldn't be off by more than one.

    It is the most likely outcome of drawing n_draws many
    samples from the population given by class_counts.

    Parameters
    ----------
    class_counts : ndarray of int
        Population per class.
    n_draws : int
        Number of draws (samples to draw) from the overall population.
    rng : random state
        Used to break ties.

    Returns
    -------
    sampled_classes : ndarray of int
        Number of samples drawn from each class.
        np.sum(sampled_classes) == n_draws

    Examples
    --------
    >>> from sklearn.model_selection._split import _approximate_mode
    >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
    array([2, 1])
    >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
    array([3, 1])
    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
    ...                   n_draws=2, rng=0)
    array([0, 1, 1, 0])
    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
    ...                   n_draws=2, rng=42)
    array([1, 1, 0, 0])
    """
    rng = check_random_state(rng)
    # proportional (generally fractional) share of the draws per class
    continuous = n_draws * class_counts / class_counts.sum()
    # floored means we don't overshoot n_draws, but probably undershoot
    floored = np.floor(continuous)
    # we add samples according to how much "left over" probability
    # they had, until we arrive at n_draws
    need_to_add = int(n_draws - floored.sum())
    if need_to_add > 0:
        remainder = continuous - floored
        # np.unique already returns sorted values; reverse them so the
        # largest remainders are served first
        values = np.unique(remainder)[::-1]
        # add according to remainder, but break ties
        # randomly to avoid biases
        for value in values:
            inds, = np.where(remainder == value)
            # if we need_to_add less than what's in inds
            # we draw randomly from them.
            # if we need to add more, we add them all and
            # go to the next value
            add_now = min(len(inds), need_to_add)
            inds = rng.choice(inds, size=add_now, replace=False)
            floored[inds] += 1
            need_to_add -= add_now
            if need_to_add == 0:
                break
    # Builtin ``int`` replaces the ``np.int`` alias, which was deprecated
    # in NumPy 1.20 and removed in 1.24.
    return floored.astype(int)
|
|
|
|
|
|
class StratifiedShuffleSplit(BaseShuffleSplit):
    """Stratified ShuffleSplit cross-validator

    Provides train/test indices to split data in train/test sets.

    This cross-validation object is a merge of StratifiedKFold and
    ShuffleSplit, which returns stratified randomized folds. The folds
    are made by preserving the percentage of samples for each class.

    Note: like the ShuffleSplit strategy, stratified random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default 10
        Number of re-shuffling & splitting iterations.

    test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. By default, the value is set to 0.1.
        The default will change in version 0.21. It will remain 0.1 only
        if ``train_size`` is unspecified, otherwise it will complement
        the specified ``train_size``.

    train_size : float, int, or None, default is None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Examples
    --------
    >>> from sklearn.model_selection import StratifiedShuffleSplit
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
    >>> sss.get_n_splits(X, y)
    3
    >>> print(sss)       # doctest: +ELLIPSIS
    StratifiedShuffleSplit(n_splits=3, random_state=0, ...)
    >>> for train_index, test_index in sss.split(X, y):
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 2] TEST: [3 0]
    TRAIN: [0 2] TEST: [1 3]
    TRAIN: [0 2] TEST: [3 1]
    """

    def __init__(self, n_splits=10, test_size="default", train_size=None,
                 random_state=None):
        super(StratifiedShuffleSplit, self).__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)

    def _iter_indices(self, X, y, groups=None):
        """Yield ``n_splits`` stratified, shuffled (train, test) pairs."""
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                                  self.train_size)

        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([' '.join(row.astype('str')) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]
        class_counts = np.bincount(y_indices)

        if np.min(class_counts) < 2:
            raise ValueError("The least populated class in y has only 1"
                             " member, which is too few. The minimum"
                             " number of groups for any class cannot"
                             " be less than 2.")

        if n_train < n_classes:
            raise ValueError('The train_size = %d should be greater or '
                             'equal to the number of classes = %d' %
                             (n_train, n_classes))
        if n_test < n_classes:
            raise ValueError('The test_size = %d should be greater or '
                             'equal to the number of classes = %d' %
                             (n_test, n_classes))

        # Sample positions grouped by class: a stable sort of the class
        # codes, cut at the cumulative class sizes.
        # (np.unique above performs a sort, so code is O(n logn) already)
        order_by_class = np.argsort(y_indices, kind='mergesort')
        class_boundaries = np.cumsum(class_counts)[:-1]
        class_indices = np.split(order_by_class, class_boundaries)

        rng = check_random_state(self.random_state)

        for _ in range(self.n_splits):
            # if there are ties in the class-counts, we want
            # to make sure to break them anew in each iteration
            train_per_class = _approximate_mode(class_counts, n_train, rng)
            class_counts_remaining = class_counts - train_per_class
            test_per_class = _approximate_mode(class_counts_remaining,
                                               n_test, rng)

            train = []
            test = []

            per_class = zip(class_indices, class_counts,
                            train_per_class, test_per_class)
            for members, count, k_train, k_test in per_class:
                # Shuffle the members of this class, then take the head
                # for train and the slice right after it for test.
                shuffled = members.take(rng.permutation(count), mode='clip')
                train.extend(shuffled[:k_train])
                test.extend(shuffled[k_train:k_train + k_test])

            # Final shuffles so samples are not ordered by class.
            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.

        y : array-like, shape (n_samples,)
            The target variable for supervised learning problems.
            Stratification is done based on the y labels.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting ``random_state``
        to an integer.
        """
        checked_y = check_array(y, ensure_2d=False, dtype=None)
        return super(StratifiedShuffleSplit, self).split(X, checked_y, groups)
|
|
|
|
|
|
def _validate_shuffle_split_init(test_size, train_size):
    """Check ``test_size`` and ``train_size`` at construction time.

    NOTE This does not take into account the number of samples which is known
    only at split

    Parameters
    ----------
    test_size : float, int, None or "default"
        Requested test size. "default" is checked as 0.1.

    train_size : float, int or None
        Requested train size.

    Raises
    ------
    ValueError
        If both sizes are None, if a float size is >= 1.0, if two float
        sizes add up to more than 1.0, or if a size is neither a float
        nor an integer.
    """
    if test_size == "default":
        if train_size is not None:
            warnings.warn("From version 0.21, test_size will always "
                          "complement train_size unless both "
                          "are specified.",
                          FutureWarning)
        test_size = 0.1

    if train_size is None and test_size is None:
        raise ValueError('test_size and train_size can not both be None')

    # NumPy dtype kind: 'f' for floats, 'i' for signed integers.
    test_kind = np.asarray(test_size).dtype.kind

    if test_size is not None:
        if test_kind == 'f':
            if test_size >= 1.:
                raise ValueError(
                    'test_size=%f should be smaller '
                    'than 1.0 or be an integer' % test_size)
        elif test_kind != 'i':
            # int values are checked during split based on the input
            raise ValueError("Invalid value for test_size: %r" % test_size)

    if train_size is None:
        return

    train_kind = np.asarray(train_size).dtype.kind
    if train_kind == 'f':
        if train_size >= 1.:
            raise ValueError("train_size=%f should be smaller "
                             "than 1.0 or be an integer" % train_size)
        if test_kind == 'f' and (train_size + test_size) > 1.:
            raise ValueError('The sum of test_size and train_size = %f, '
                             'should be smaller than 1.0. Reduce '
                             'test_size and/or train_size.' %
                             (train_size + test_size))
    elif train_kind != 'i':
        # int values are checked during split based on the input
        raise ValueError("Invalid value for train_size: %r" % train_size)
|
|
|
|
|
|
def _validate_shuffle_split(n_samples, test_size, train_size):
    """Turn the requested train/test sizes into absolute sample counts.

    Validation helper to check if the train/test sizes are meaningful wrt
    to the size of the data (n_samples).

    Parameters
    ----------
    n_samples : int
        Number of samples available.

    test_size : float, int, None or "default"
        Requested test size; "default" stands for 0.1. A float is a
        fraction of ``n_samples`` (rounded up), an int is a count.

    train_size : float, int or None
        Requested train size. A float is a fraction of ``n_samples``
        (rounded down), an int is a count.

    Returns
    -------
    n_train, n_test : int
        Number of train and test samples.

    Raises
    ------
    ValueError
        If an integer size is not smaller than ``n_samples`` or if both
        counts together exceed ``n_samples``.
    """
    if test_size == "default":
        test_size = 0.1

    # NumPy dtype kind ('f' float, 'i' signed int); None when unspecified.
    test_kind = None
    if test_size is not None:
        test_kind = np.asarray(test_size).dtype.kind
    train_kind = None
    if train_size is not None:
        train_kind = np.asarray(train_size).dtype.kind

    if test_kind == 'i' and test_size >= n_samples:
        raise ValueError('test_size=%d should be smaller than the number of '
                         'samples %d' % (test_size, n_samples))

    if train_kind == 'i' and train_size >= n_samples:
        raise ValueError("train_size=%d should be smaller than the number of"
                         " samples %d" % (train_size, n_samples))

    if test_kind == 'f':
        n_test = ceil(test_size * n_samples)
    elif test_kind == 'i':
        n_test = float(test_size)

    if train_size is None:
        n_train = n_samples - n_test
    elif train_kind == 'f':
        n_train = floor(train_size * n_samples)
    else:
        n_train = float(train_size)

    if test_size is None:
        # The test set is whatever the train set leaves over.
        n_test = n_samples - n_train

    total = n_train + n_test
    if total > n_samples:
        raise ValueError('The sum of train_size and test_size = %d, '
                         'should be smaller than the number of '
                         'samples %d. Reduce test_size and/or '
                         'train_size.' % (total, n_samples))

    return int(n_train), int(n_test)
|
|
|
|
|
|
class PredefinedSplit(BaseCrossValidator):
    """Predefined split cross-validator

    Provides train/test indices to split data into train/test sets using a
    predefined scheme specified by the user with the ``test_fold`` parameter.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    test_fold : array-like, shape (n_samples,)
        The entry ``test_fold[i]`` represents the index of the test set that
        sample ``i`` belongs to. It is possible to exclude sample ``i`` from
        any test set (i.e. include sample ``i`` in every training set) by
        setting ``test_fold[i]`` equal to -1.

    Examples
    --------
    >>> from sklearn.model_selection import PredefinedSplit
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> test_fold = [0, 1, -1, 1]
    >>> ps = PredefinedSplit(test_fold)
    >>> ps.get_n_splits()
    2
    >>> print(ps)       # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    PredefinedSplit(test_fold=array([ 0,  1, -1,  1]))
    >>> for train_index, test_index in ps.split():
    ...     print("TRAIN:", train_index, "TEST:", test_index)
    ...     X_train, X_test = X[train_index], X[test_index]
    ...     y_train, y_test = y[train_index], y[test_index]
    TRAIN: [1 2 3] TEST: [0]
    TRAIN: [0 2] TEST: [1 3]
    """

    def __init__(self, test_fold):
        # Builtin ``int`` replaces the ``np.int`` alias, which was
        # deprecated in NumPy 1.20 and removed in 1.24.
        self.test_fold = np.array(test_fold, dtype=int)
        self.test_fold = column_or_1d(self.test_fold)
        self.unique_folds = np.unique(self.test_fold)
        # -1 marks samples that never appear in a test set, so it is not
        # a fold of its own.
        self.unique_folds = self.unique_folds[self.unique_folds != -1]

    def split(self, X=None, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        ind = np.arange(len(self.test_fold))
        for test_mask in self._iter_test_masks():
            yield ind[np.logical_not(test_mask)], ind[test_mask]

    def _iter_test_masks(self):
        """Generates boolean masks corresponding to test sets."""
        for fold in self.unique_folds:
            # The comparison already yields a fresh boolean mask with one
            # entry per sample; no index round-trip is needed.
            yield self.test_fold == fold

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Number of distinct fold indices in ``test_fold``, not
            counting -1.
        """
        return len(self.unique_folds)
|
|
|
|
|
|
class _CVIterableWrapper(BaseCrossValidator):
    """Wrapper class for old style cv objects and iterables.

    The iterable is consumed once, at construction time, and its items
    are kept in a list.
    """
    def __init__(self, cv):
        self.cv = list(cv)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of splitting iterations in the cross-validator.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Number of stored splits.
        """
        return len(self.cv)

    def split(self, X=None, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        for fold in self.cv:
            # Each stored item must unpack into exactly two parts.
            train_index, test_index = fold
            yield train_index, test_index
|
|
|
|
|
|
def check_cv(cv=3, y=None, classifier=False):
    """Normalise a ``cv`` argument into a cross-validator object.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, optional
        Specification of the cross-validation splitting strategy.
        Accepted values are:

        - None, which is treated as 3 (3-fold cross-validation),
        - an integer, giving the number of folds,
        - an object to be used as a cross-validation generator (anything
          exposing a ``split`` attribute), which is returned unchanged,
        - an iterable yielding train/test splits, which gets wrapped.

        For integer/None inputs, :class:`StratifiedKFold` is used when
        ``classifier`` is true and ``y`` is either binary or multiclass.
        In all other cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    y : array-like, optional
        The target variable for supervised learning problems.

    classifier : boolean, optional, default False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the train/test
        splits via the ``split`` method.

    Raises
    ------
    ValueError
        If ``cv`` is a string, or is neither an integer, an object with a
        ``split`` attribute, nor an iterable.
    """
    if cv is None:
        cv = 3

    if isinstance(cv, numbers.Integral):
        # ``type_of_target`` is only consulted for classifiers with a target.
        stratify = (classifier and (y is not None) and
                    (type_of_target(y) in ('binary', 'multiclass')))
        fold_cls = StratifiedKFold if stratify else KFold
        return fold_cls(cv)

    if hasattr(cv, 'split') and not isinstance(cv, str):
        return cv  # New style cv objects are passed without any modification

    # Strings are iterable (and have ``split``) but are never valid here.
    if not isinstance(cv, Iterable) or isinstance(cv, str):
        raise ValueError("Expected cv as an integer, cross-validation "
                         "object (from sklearn.model_selection) "
                         "or an iterable. Got %s." % cv)
    return _CVIterableWrapper(cv)
|
|
|
|
|
|
def train_test_split(*arrays, **options):
    """Split arrays or matrices into random train and test subsets

    Convenience helper that bundles input validation, a single draw from
    ``next(ShuffleSplit().split(X, y))`` and the indexing of every input
    into one call, so that data can be split (and optionally subsampled)
    in a oneliner.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int, None, optional
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. When neither ``test_size`` nor
        ``train_size`` is given, 0.25 is used. When ``test_size`` is left
        out but ``train_size`` is given, a ``FutureWarning`` announces that
        from version 0.21 ``test_size`` will always complement
        ``train_size`` unless both are specified.

    train_size : float, int, or None, default None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Only used when shuffling.

    shuffle : boolean, optional (default=True)
        Whether or not to shuffle the data before splitting. If shuffle=False
        then stratify must be None.

    stratify : array-like or None (default is None)
        If not None, data is split in a stratified fashion, using this as
        the class labels.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

        .. versionadded:: 0.16
            If the input is sparse, the output will be a
            ``scipy.sparse.csr_matrix``. Else, output type is the same as the
            input type.

    Raises
    ------
    ValueError
        If no array is passed, or if ``stratify`` is given together with
        ``shuffle=False``.
    TypeError
        If an unknown keyword option is passed.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import train_test_split
    >>> X, y = np.arange(10).reshape((5, 2)), range(5)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> list(y)
    [0, 1, 2, 3, 4]

    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, test_size=0.33, random_state=42)
    ...
    >>> X_train
    array([[4, 5],
           [0, 1],
           [6, 7]])
    >>> y_train
    [2, 0, 3]
    >>> X_test
    array([[2, 3],
           [8, 9]])
    >>> y_test
    [1, 4]

    >>> train_test_split(y, shuffle=False)
    [[0, 1, 2], [3, 4]]

    """
    if not arrays:
        raise ValueError("At least one array required as input")

    # 'default' is a sentinel telling "omitted" apart from an explicit None.
    test_size = options.pop('test_size', 'default')
    train_size = options.pop('train_size', None)
    random_state = options.pop('random_state', None)
    stratify = options.pop('stratify', None)
    shuffle = options.pop('shuffle', True)

    # Anything still left in ``options`` is an unknown keyword.
    if options:
        raise TypeError("Invalid parameters passed: %s" % str(options))

    if test_size == 'default':
        test_size = None
        if train_size is not None:
            warnings.warn("From version 0.21, test_size will always "
                          "complement train_size unless both "
                          "are specified.",
                          FutureWarning)

    if test_size is None and train_size is None:
        test_size = 0.25

    arrays = indexable(*arrays)

    if shuffle is not False:
        # Randomised split: draw one split from the matching splitter.
        if stratify is None:
            splitter_cls = ShuffleSplit
        else:
            splitter_cls = StratifiedShuffleSplit

        splitter = splitter_cls(test_size=test_size,
                                train_size=train_size,
                                random_state=random_state)

        train, test = next(splitter.split(X=arrays[0], y=stratify))
    else:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for "
                "shuffle=False")

        # Ordered split: the first n_train samples form the training set and
        # the n_test samples right after them form the test set.
        n_samples = _num_samples(arrays[0])
        n_train, n_test = _validate_shuffle_split(n_samples, test_size,
                                                  train_size)

        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)

    # Emit (train part, test part) for every input, in input order.
    splitting = []
    for array in arrays:
        splitting.append(safe_indexing(array, train))
        splitting.append(safe_indexing(array, test))
    return splitting


# Despite the "test" in its name this is not a test function; flag it so
# that nosetests does not try to collect it.
train_test_split.__test__ = False
|
|
|
|
|
|
def _build_repr(self):
    """Build a ``ClassName(param=value, ...)`` representation of ``self``.

    The parameter names are taken from the signature of the class's
    ``__init__`` (or of its ``deprecated_original`` attribute when present),
    excluding ``self`` and any ``**kwargs`` parameter, and are processed in
    sorted order. Each value is read from the attribute of the same name on
    ``self``; a missing attribute is shown as ``None``.

    A parameter is left out when reading its attribute emits warnings and
    the first one recorded is a ``DeprecationWarning``.

    Returns
    -------
    repr : str
        The class name followed by the pretty-printed parameters in
        parentheses.
    """
    # XXX This is copied from BaseEstimator's get_params
    cls = self.__class__
    init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
    # Ignore varargs, kw and default values and pop self
    init_signature = signature(init)
    # Consider the constructor parameters excluding 'self'
    if init is object.__init__:
        args = []
    else:
        args = sorted([p.name for p in init_signature.parameters.values()
                       if p.name != 'self' and p.kind != p.VAR_KEYWORD])
    class_name = self.__class__.__name__
    params = dict()
    for key in args:
        # We need deprecation warnings to always be on in order to
        # catch deprecated param values.
        # The filter is installed *inside* ``catch_warnings`` so that the
        # context manager restores the previous filter state on exit.
        # Inserting it globally and undoing that with
        # ``warnings.filters.pop(0)`` could drop an identical filter that
        # was already registered by the user.
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always", DeprecationWarning)
            value = getattr(self, key, None)
        if len(w) and w[0].category == DeprecationWarning:
            # if the parameter is deprecated, don't show it
            continue
        params[key] = value

    return '%s(%s)' % (class_name, _pprint(params, offset=len(class_name)))
|