"""The :mod:`sklearn.model_selection._split` module includes classes andfunctions to split the data based on a preset strategy."""# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr># Gael Varoquaux <gael.varoquaux@normalesup.org># Olivier Grisel <olivier.grisel@ensta.org># Raghav RV <rvraghav93@gmail.com># Leandro Hermida <hermidal@cs.umd.edu># Rodion Martynov <marrodion@gmail.com># License: BSD 3 clauseimportnumbersimportwarningsfromabcimportABCMeta,abstractmethodfromcollectionsimportdefaultdictfromcollections.abcimportIterablefrominspectimportsignaturefromitertoolsimportchain,combinationsfrommathimportceil,floorimportnumpyasnpfromscipy.specialimportcombfrom..utilsimport(_approximate_mode,_safe_indexing,check_random_state,indexable,metadata_routing,)from..utils._param_validationimportInterval,RealNotInt,validate_paramsfrom..utils.metadata_routingimport_MetadataRequesterfrom..utils.multiclassimporttype_of_targetfrom..utils.validationimport_num_samples,check_array,column_or_1d__all__=["BaseCrossValidator","KFold","GroupKFold","LeaveOneGroupOut","LeaveOneOut","LeavePGroupsOut","LeavePOut","RepeatedStratifiedKFold","RepeatedKFold","ShuffleSplit","GroupShuffleSplit","StratifiedKFold","StratifiedGroupKFold","StratifiedShuffleSplit","PredefinedSplit","train_test_split","check_cv",]classGroupsConsumerMixin(_MetadataRequester):"""A Mixin to ``groups`` by default. This Mixin makes the object to request ``groups`` by default as ``True``. .. versionadded:: 1.3 """__metadata_request__split={"groups":True}classBaseCrossValidator(_MetadataRequester,metaclass=ABCMeta):"""Base class for all cross-validators. Implementations must define `_iter_test_masks` or `_iter_test_indices`. """# This indicates that by default CV splitters don't have a "groups" kwarg,# unless indicated by inheriting from ``GroupsConsumerMixin``.# This also prevents ``set_split_request`` to be generated for splitters# which don't support ``groups``.__metadata_request__split={"groups":metadata_routing.UNUSED}defsplit(self,X,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,) The target variable for supervised learning problems. groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """X,y,groups=indexable(X,y,groups)indices=np.arange(_num_samples(X))fortest_indexinself._iter_test_masks(X,y,groups):train_index=indices[np.logical_not(test_index)]test_index=indices[test_index]yieldtrain_index,test_index# Since subclasses must implement either _iter_test_masks or# _iter_test_indices, neither can be abstract.def_iter_test_masks(self,X=None,y=None,groups=None):"""Generates boolean masks corresponding to test sets. By default, delegates to _iter_test_indices(X, y, groups) """fortest_indexinself._iter_test_indices(X,y,groups):test_mask=np.zeros(_num_samples(X),dtype=bool)test_mask[test_index]=Trueyieldtest_maskdef_iter_test_indices(self,X=None,y=None,groups=None):"""Generates integer indices corresponding to test sets."""raiseNotImplementedError@abstractmethoddefget_n_splits(self,X=None,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator."""def__repr__(self):return_build_repr(self)classLeaveOneOut(BaseCrossValidator):"""Leave-One-Out cross-validator. Provides train/test indices to split data in train/test sets. Each sample is used once as a test set (singleton) while the remaining samples form the training set. Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and ``LeavePOut(p=1)`` where ``n`` is the number of samples. Due to the high number of test sets (which is the same as the number of samples) this cross-validation method can be very costly. For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit` or :class:`StratifiedKFold`. Read more in the :ref:`User Guide <leave_one_out>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import LeaveOneOut >>> X = np.array([[1, 2], [3, 4]]) >>> y = np.array([1, 2]) >>> loo = LeaveOneOut() >>> loo.get_n_splits(X) 2 >>> print(loo) LeaveOneOut() >>> for i, (train_index, test_index) in enumerate(loo.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[1] Test: index=[0] Fold 1: Train: index=[0] Test: index=[1] See Also -------- LeaveOneGroupOut : For splitting the data according to explicit, domain-specific stratification of the dataset. GroupKFold : K-fold iterator variant with non-overlapping groups. """def_iter_test_indices(self,X,y=None,groups=None):n_samples=_num_samples(X)ifn_samples<=1:raiseValueError("Cannot perform LeaveOneOut with n_samples={}.".format(n_samples))returnrange(n_samples)defget_n_splits(self,X,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : object Always ignored, exists for compatibility. groups : object Always ignored, exists for compatibility. Returns ------- n_splits : int Returns the number of splitting iterations in the cross-validator. """ifXisNone:raiseValueError("The 'X' parameter should not be None.")return_num_samples(X)classLeavePOut(BaseCrossValidator):"""Leave-P-Out cross-validator. Provides train/test indices to split data in train/test sets. This results in testing on all distinct samples of size p, while the remaining n - p samples form the training set in each iteration. Note: ``LeavePOut(p)`` is NOT equivalent to ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets. Due to the high number of iterations which grows combinatorically with the number of samples this cross-validation method can be very costly. For large datasets one should favor :class:`KFold`, :class:`StratifiedKFold` or :class:`ShuffleSplit`. Read more in the :ref:`User Guide <leave_p_out>`. Parameters ---------- p : int Size of the test sets. Must be strictly less than the number of samples. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import LeavePOut >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) >>> y = np.array([1, 2, 3, 4]) >>> lpo = LeavePOut(2) >>> lpo.get_n_splits(X) 6 >>> print(lpo) LeavePOut(p=2) >>> for i, (train_index, test_index) in enumerate(lpo.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[2 3] Test: index=[0 1] Fold 1: Train: index=[1 3] Test: index=[0 2] Fold 2: Train: index=[1 2] Test: index=[0 3] Fold 3: Train: index=[0 3] Test: index=[1 2] Fold 4: Train: index=[0 2] Test: index=[1 3] Fold 5: Train: index=[0 1] Test: index=[2 3] """def__init__(self,p):self.p=pdef_iter_test_indices(self,X,y=None,groups=None):n_samples=_num_samples(X)ifn_samples<=self.p:raiseValueError("p={} must be strictly less than the number of samples={}".format(self.p,n_samples))forcombinationincombinations(range(n_samples),self.p):yieldnp.array(combination)defget_n_splits(self,X,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : object Always ignored, exists for compatibility. groups : object Always ignored, exists for compatibility. """ifXisNone:raiseValueError("The 'X' parameter should not be None.")returnint(comb(_num_samples(X),self.p,exact=True))class_BaseKFold(BaseCrossValidator,metaclass=ABCMeta):"""Base class for K-Fold cross-validators and TimeSeriesSplit."""@abstractmethoddef__init__(self,n_splits,*,shuffle,random_state):ifnotisinstance(n_splits,numbers.Integral):raiseValueError("The number of folds must be of Integral type. ""%s of type %s was passed."%(n_splits,type(n_splits)))n_splits=int(n_splits)ifn_splits<=1:raiseValueError("k-fold cross-validation requires at least one"" train/test split by setting n_splits=2 or more,"" got n_splits={0}.".format(n_splits))ifnotisinstance(shuffle,bool):raiseTypeError("shuffle must be True or False; got {0}".format(shuffle))ifnotshuffleandrandom_stateisnotNone:# None is the defaultraiseValueError(("Setting a random_state has no effect since shuffle is ""False. You should leave ""random_state to its default (None), or set shuffle=True."),)self.n_splits=n_splitsself.shuffle=shuffleself.random_state=random_statedefsplit(self,X,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """X,y,groups=indexable(X,y,groups)n_samples=_num_samples(X)ifself.n_splits>n_samples:raiseValueError(("Cannot have number of splits n_splits={0} greater"" than the number of samples: n_samples={1}.").format(self.n_splits,n_samples))fortrain,testinsuper().split(X,y,groups):yieldtrain,testdefget_n_splits(self,X=None,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator. Parameters ---------- X : object Always ignored, exists for compatibility. y : object Always ignored, exists for compatibility. groups : object Always ignored, exists for compatibility. Returns ------- n_splits : int Returns the number of splitting iterations in the cross-validator. """returnself.n_splitsclassKFold(_BaseKFold):"""K-Fold cross-validator. Provides train/test indices to split data in train/test sets. Split dataset into k consecutive folds (without shuffling by default). Each fold is then used once as a validation while the k - 1 remaining folds form the training set. Read more in the :ref:`User Guide <k_fold>`. For visualisation of cross-validation behaviour and comparison between common scikit-learn split methods refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` Parameters ---------- n_splits : int, default=5 Number of folds. Must be at least 2. .. versionchanged:: 0.22 ``n_splits`` default value changed from 3 to 5. shuffle : bool, default=False Whether to shuffle the data before splitting into batches. Note that the samples within each split will not be shuffled. random_state : int, RandomState instance or None, default=None When `shuffle` is True, `random_state` affects the ordering of the indices, which controls the randomness of each fold. Otherwise, this parameter has no effect. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import KFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([1, 2, 3, 4]) >>> kf = KFold(n_splits=2) >>> kf.get_n_splits(X) 2 >>> print(kf) KFold(n_splits=2, random_state=None, shuffle=False) >>> for i, (train_index, test_index) in enumerate(kf.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[2 3] Test: index=[0 1] Fold 1: Train: index=[0 1] Test: index=[2 3] Notes ----- The first ``n_samples % n_splits`` folds have size ``n_samples // n_splits + 1``, other folds have size ``n_samples // n_splits``, where ``n_samples`` is the number of samples. Randomized CV splitters may return different results for each call of split. You can make the results identical by setting `random_state` to an integer. See Also -------- StratifiedKFold : Takes class information into account to avoid building folds with imbalanced class distributions (for binary or multiclass classification tasks). GroupKFold : K-fold iterator variant with non-overlapping groups. RepeatedKFold : Repeats K-Fold n times. """def__init__(self,n_splits=5,*,shuffle=False,random_state=None):super().__init__(n_splits=n_splits,shuffle=shuffle,random_state=random_state)def_iter_test_indices(self,X,y=None,groups=None):n_samples=_num_samples(X)indices=np.arange(n_samples)ifself.shuffle:check_random_state(self.random_state).shuffle(indices)n_splits=self.n_splitsfold_sizes=np.full(n_splits,n_samples//n_splits,dtype=int)fold_sizes[:n_samples%n_splits]+=1current=0forfold_sizeinfold_sizes:start,stop=current,current+fold_sizeyieldindices[start:stop]current=stopclassGroupKFold(GroupsConsumerMixin,_BaseKFold):"""K-fold iterator variant with non-overlapping groups. Each group will appear exactly once in the test set across all folds (the number of distinct groups has to be at least equal to the number of folds). The folds are approximately balanced in the sense that the number of distinct groups is approximately the same in each fold. Read more in the :ref:`User Guide <group_k_fold>`. For visualisation of cross-validation behaviour and comparison between common scikit-learn split methods refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` Parameters ---------- n_splits : int, default=5 Number of folds. Must be at least 2. .. versionchanged:: 0.22 ``n_splits`` default value changed from 3 to 5. Notes ----- Groups appear in an arbitrary order throughout the folds. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import GroupKFold >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]) >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> groups = np.array([0, 0, 2, 2, 3, 3]) >>> group_kfold = GroupKFold(n_splits=2) >>> group_kfold.get_n_splits(X, y, groups) 2 >>> print(group_kfold) GroupKFold(n_splits=2) >>> for i, (train_index, test_index) in enumerate(group_kfold.split(X, y, groups)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}, group={groups[train_index]}") ... print(f" Test: index={test_index}, group={groups[test_index]}") Fold 0: Train: index=[2 3], group=[2 2] Test: index=[0 1 4 5], group=[0 0 3 3] Fold 1: Train: index=[0 1 4 5], group=[0 0 3 3] Test: index=[2 3], group=[2 2] See Also -------- LeaveOneGroupOut : For splitting the data according to explicit domain-specific stratification of the dataset. StratifiedKFold : Takes class information into account to avoid building folds with imbalanced class proportions (for binary or multiclass classification tasks). """def__init__(self,n_splits=5):super().__init__(n_splits,shuffle=False,random_state=None)def_iter_test_indices(self,X,y,groups):ifgroupsisNone:raiseValueError("The 'groups' parameter should not be None.")groups=check_array(groups,input_name="groups",ensure_2d=False,dtype=None)unique_groups,groups=np.unique(groups,return_inverse=True)n_groups=len(unique_groups)ifself.n_splits>n_groups:raiseValueError("Cannot have number of splits n_splits=%d greater"" than the number of groups: %d."%(self.n_splits,n_groups))# Weight groups by their number of occurrencesn_samples_per_group=np.bincount(groups)# Distribute the most frequent groups firstindices=np.argsort(n_samples_per_group)[::-1]n_samples_per_group=n_samples_per_group[indices]# Total weight of each foldn_samples_per_fold=np.zeros(self.n_splits)# Mapping from group index to fold indexgroup_to_fold=np.zeros(len(unique_groups))# Distribute samples by adding the largest weight to the lightest foldforgroup_index,weightinenumerate(n_samples_per_group):lightest_fold=np.argmin(n_samples_per_fold)n_samples_per_fold[lightest_fold]+=weightgroup_to_fold[indices[group_index]]=lightest_foldindices=group_to_fold[groups]forfinrange(self.n_splits):yieldnp.where(indices==f)[0]defsplit(self,X,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """returnsuper().split(X,y,groups)classStratifiedKFold(_BaseKFold):"""Stratified K-Fold cross-validator. Provides train/test indices to split data in train/test sets. This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class. Read more in the :ref:`User Guide <stratified_k_fold>`. For visualisation of cross-validation behaviour and comparison between common scikit-learn split methods refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` Parameters ---------- n_splits : int, default=5 Number of folds. Must be at least 2. .. versionchanged:: 0.22 ``n_splits`` default value changed from 3 to 5. shuffle : bool, default=False Whether to shuffle each class's samples before splitting into batches. Note that the samples within each split will not be shuffled. random_state : int, RandomState instance or None, default=None When `shuffle` is True, `random_state` affects the ordering of the indices, which controls the randomness of each fold for each class. Otherwise, leave `random_state` as `None`. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import StratifiedKFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 1, 1]) >>> skf = StratifiedKFold(n_splits=2) >>> skf.get_n_splits(X, y) 2 >>> print(skf) StratifiedKFold(n_splits=2, random_state=None, shuffle=False) >>> for i, (train_index, test_index) in enumerate(skf.split(X, y)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[1 3] Test: index=[0 2] Fold 1: Train: index=[0 2] Test: index=[1 3] Notes ----- The implementation is designed to: * Generate test sets such that all contain the same distribution of classes, or as close as possible. * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to ``y = [1, 0]`` should not change the indices generated. * Preserve order dependencies in the dataset ordering, when ``shuffle=False``: all samples from class k in some test set were contiguous in y, or separated in y by samples from classes other than k. * Generate test sets where the smallest and largest differ by at most one sample. .. versionchanged:: 0.22 The previous implementation did not follow the last constraint. See Also -------- RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. """def__init__(self,n_splits=5,*,shuffle=False,random_state=None):super().__init__(n_splits=n_splits,shuffle=shuffle,random_state=random_state)def_make_test_folds(self,X,y=None):rng=check_random_state(self.random_state)y=np.asarray(y)type_of_target_y=type_of_target(y)allowed_target_types=("binary","multiclass")iftype_of_target_ynotinallowed_target_types:raiseValueError("Supported target types are: {}. Got {!r} instead.".format(allowed_target_types,type_of_target_y))y=column_or_1d(y)_,y_idx,y_inv=np.unique(y,return_index=True,return_inverse=True)# y_inv encodes y according to lexicographic order. We invert y_idx to# map the classes so that they are encoded by order of appearance:# 0 represents the first label appearing in y, 1 the second, etc._,class_perm=np.unique(y_idx,return_inverse=True)y_encoded=class_perm[y_inv]n_classes=len(y_idx)y_counts=np.bincount(y_encoded)min_groups=np.min(y_counts)ifnp.all(self.n_splits>y_counts):raiseValueError("n_splits=%d cannot be greater than the"" number of members in each class."%(self.n_splits))ifself.n_splits>min_groups:warnings.warn("The least populated class in y has only %d"" members, which is less than n_splits=%d."%(min_groups,self.n_splits),UserWarning,)# Determine the optimal number of samples from each class in each fold,# using round robin over the sorted y. (This can be done direct from# counts, but that code is unreadable.)y_order=np.sort(y_encoded)allocation=np.asarray([np.bincount(y_order[i::self.n_splits],minlength=n_classes)foriinrange(self.n_splits)])# To maintain the data order dependencies as best as possible within# the stratification constraint, we assign samples from each class in# blocks (and then mess that up when shuffle=True).test_folds=np.empty(len(y),dtype="i")forkinrange(n_classes):# since the kth column of allocation stores the number of samples# of class k in each test set, this generates blocks of fold# indices corresponding to the allocation for class k.folds_for_class=np.arange(self.n_splits).repeat(allocation[:,k])ifself.shuffle:rng.shuffle(folds_for_class)test_folds[y_encoded==k]=folds_for_classreturntest_foldsdef_iter_test_masks(self,X,y=None,groups=None):test_folds=self._make_test_folds(X,y)foriinrange(self.n_splits):yieldtest_folds==i
[docs]defsplit(self,X,y,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. Note that providing ``y`` is sufficient to generate the splits and hence ``np.zeros(n_samples)`` may be used as a placeholder for ``X`` instead of actual training data. y : array-like of shape (n_samples,) The target variable for supervised learning problems. Stratification is done based on the y labels. groups : object Always ignored, exists for compatibility. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. Notes ----- Randomized CV splitters may return different results for each call of split. You can make the results identical by setting `random_state` to an integer. """y=check_array(y,input_name="y",ensure_2d=False,dtype=None)returnsuper().split(X,y,groups)
classStratifiedGroupKFold(GroupsConsumerMixin,_BaseKFold):"""Stratified K-Fold iterator variant with non-overlapping groups. This cross-validation object is a variation of StratifiedKFold attempts to return stratified folds with non-overlapping groups. The folds are made by preserving the percentage of samples for each class. Each group will appear exactly once in the test set across all folds (the number of distinct groups has to be at least equal to the number of folds). The difference between :class:`~sklearn.model_selection.GroupKFold` and :class:`~sklearn.model_selection.StratifiedGroupKFold` is that the former attempts to create balanced folds such that the number of distinct groups is approximately the same in each fold, whereas StratifiedGroupKFold attempts to create folds which preserve the percentage of samples for each class as much as possible given the constraint of non-overlapping groups between splits. Read more in the :ref:`User Guide <cross_validation>`. For visualisation of cross-validation behaviour and comparison between common scikit-learn split methods refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` Parameters ---------- n_splits : int, default=5 Number of folds. Must be at least 2. shuffle : bool, default=False Whether to shuffle each class's samples before splitting into batches. Note that the samples within each split will not be shuffled. This implementation can only shuffle groups that have approximately the same y distribution, no global shuffle will be performed. random_state : int or RandomState instance, default=None When `shuffle` is True, `random_state` affects the ordering of the indices, which controls the randomness of each fold for each class. Otherwise, leave `random_state` as `None`. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import StratifiedGroupKFold >>> X = np.ones((17, 2)) >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8]) >>> sgkf = StratifiedGroupKFold(n_splits=3) >>> sgkf.get_n_splits(X, y) 3 >>> print(sgkf) StratifiedGroupKFold(n_splits=3, random_state=None, shuffle=False) >>> for i, (train_index, test_index) in enumerate(sgkf.split(X, y, groups)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" group={groups[train_index]}") ... print(f" Test: index={test_index}") ... print(f" group={groups[test_index]}") Fold 0: Train: index=[ 0 1 2 3 7 8 9 10 11 15 16] group=[1 1 2 2 4 5 5 5 5 8 8] Test: index=[ 4 5 6 12 13 14] group=[3 3 3 6 6 7] Fold 1: Train: index=[ 4 5 6 7 8 9 10 11 12 13 14] group=[3 3 3 4 5 5 5 5 6 6 7] Test: index=[ 0 1 2 3 15 16] group=[1 1 2 2 8 8] Fold 2: Train: index=[ 0 1 2 3 4 5 6 12 13 14 15 16] group=[1 1 2 2 3 3 3 6 6 7 8 8] Test: index=[ 7 8 9 10 11] group=[4 5 5 5 5] Notes ----- The implementation is designed to: * Mimic the behavior of StratifiedKFold as much as possible for trivial groups (e.g. when each group contains only one sample). * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to ``y = [1, 0]`` should not change the indices generated. * Stratify based on samples as much as possible while keeping non-overlapping groups constraint. That means that in some cases when there is a small number of groups containing a large number of samples the stratification will not be possible and the behavior will be close to GroupKFold. See also -------- StratifiedKFold: Takes class information into account to build folds which retain class distributions (for binary or multiclass classification tasks). GroupKFold: K-fold iterator variant with non-overlapping groups. """def__init__(self,n_splits=5,shuffle=False,random_state=None):super().__init__(n_splits=n_splits,shuffle=shuffle,random_state=random_state)def_iter_test_indices(self,X,y,groups):# Implementation is based on this kaggle kernel:# https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation# and is a subject to Apache 2.0 License. You may obtain a copy of the# License at http://www.apache.org/licenses/LICENSE-2.0# Changelist:# - Refactored function to a class following scikit-learn KFold# interface.# - Added heuristic for assigning group to the least populated fold in# cases when all other criteria are equal# - Swtch from using python ``Counter`` to ``np.unique`` to get class# distribution# - Added scikit-learn checks for input: checking that target is binary# or multiclass, checking passed random state, checking that number# of splits is less than number of members in each class, checking# that least populated class has more members than there are splits.rng=check_random_state(self.random_state)y=np.asarray(y)type_of_target_y=type_of_target(y)allowed_target_types=("binary","multiclass")iftype_of_target_ynotinallowed_target_types:raiseValueError("Supported target types are: {}. Got {!r} instead.".format(allowed_target_types,type_of_target_y))y=column_or_1d(y)_,y_inv,y_cnt=np.unique(y,return_inverse=True,return_counts=True)ifnp.all(self.n_splits>y_cnt):raiseValueError("n_splits=%d cannot be greater than the"" number of members in each class."%(self.n_splits))n_smallest_class=np.min(y_cnt)ifself.n_splits>n_smallest_class:warnings.warn("The least populated class in y has only %d"" members, which is less than n_splits=%d."%(n_smallest_class,self.n_splits),UserWarning,)n_classes=len(y_cnt)_,groups_inv,groups_cnt=np.unique(groups,return_inverse=True,return_counts=True)y_counts_per_group=np.zeros((len(groups_cnt),n_classes))forclass_idx,group_idxinzip(y_inv,groups_inv):y_counts_per_group[group_idx,class_idx]+=1y_counts_per_fold=np.zeros((self.n_splits,n_classes))groups_per_fold=defaultdict(set)ifself.shuffle:rng.shuffle(y_counts_per_group)# Stable sort to keep shuffled order for groups with the same# class distribution variancesorted_groups_idx=np.argsort(-np.std(y_counts_per_group,axis=1),kind="mergesort")forgroup_idxinsorted_groups_idx:group_y_counts=y_counts_per_group[group_idx]best_fold=self._find_best_fold(y_counts_per_fold=y_counts_per_fold,y_cnt=y_cnt,group_y_counts=group_y_counts,)y_counts_per_fold[best_fold]+=group_y_countsgroups_per_fold[best_fold].add(group_idx)foriinrange(self.n_splits):test_indices=[idxforidx,group_idxinenumerate(groups_inv)ifgroup_idxingroups_per_fold[i]]yieldtest_indicesdef_find_best_fold(self,y_counts_per_fold,y_cnt,group_y_counts):best_fold=Nonemin_eval=np.infmin_samples_in_fold=np.infforiinrange(self.n_splits):y_counts_per_fold[i]+=group_y_counts# Summarise the distribution over classes in each proposed foldstd_per_class=np.std(y_counts_per_fold/y_cnt.reshape(1,-1),axis=0)y_counts_per_fold[i]-=group_y_countsfold_eval=np.mean(std_per_class)samples_in_fold=np.sum(y_counts_per_fold[i])is_current_fold_better=(fold_eval<min_evalornp.isclose(fold_eval,min_eval)andsamples_in_fold<min_samples_in_fold)ifis_current_fold_better:min_eval=fold_evalmin_samples_in_fold=samples_in_foldbest_fold=ireturnbest_foldclassTimeSeriesSplit(_BaseKFold):"""Time Series cross-validator. Provides train/test indices to split time series data samples that are observed at fixed time intervals, in train/test sets. In each split, test indices must be higher than before, and thus shuffling in cross validator is inappropriate. This cross-validation object is a variation of :class:`KFold`. In the kth split, it returns first k folds as train set and the (k+1)th fold as test set. Note that unlike standard cross-validation methods, successive training sets are supersets of those that come before them. Read more in the :ref:`User Guide <time_series_split>`. For visualisation of cross-validation behaviour and comparison between common scikit-learn split methods refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` .. versionadded:: 0.18 Parameters ---------- n_splits : int, default=5 Number of splits. Must be at least 2. .. versionchanged:: 0.22 ``n_splits`` default value changed from 3 to 5. max_train_size : int, default=None Maximum size for a single training set. test_size : int, default=None Used to limit the size of the test set. Defaults to ``n_samples // (n_splits + 1)``, which is the maximum allowed value with ``gap=0``. .. versionadded:: 0.24 gap : int, default=0 Number of samples to exclude from the end of each train set before the test set. .. versionadded:: 0.24 Examples -------- >>> import numpy as np >>> from sklearn.model_selection import TimeSeriesSplit >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([1, 2, 3, 4, 5, 6]) >>> tscv = TimeSeriesSplit() >>> print(tscv) TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None) >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[0] Test: index=[1] Fold 1: Train: index=[0 1] Test: index=[2] Fold 2: Train: index=[0 1 2] Test: index=[3] Fold 3: Train: index=[0 1 2 3] Test: index=[4] Fold 4: Train: index=[0 1 2 3 4] Test: index=[5] >>> # Fix test_size to 2 with 12 samples >>> X = np.random.randn(12, 2) >>> y = np.random.randint(0, 2, 12) >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2) >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[0 1 2 3 4 5] Test: index=[6 7] Fold 1: Train: index=[0 1 2 3 4 5 6 7] Test: index=[8 9] Fold 2: Train: index=[0 1 2 3 4 5 6 7 8 9] Test: index=[10 11] >>> # Add in a 2 period gap >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2) >>> for i, (train_index, test_index) in enumerate(tscv.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[0 1 2 3] Test: index=[6 7] Fold 1: Train: index=[0 1 2 3 4 5] Test: index=[8 9] Fold 2: Train: index=[0 1 2 3 4 5 6 7] Test: index=[10 11] For a more extended example see :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`. Notes ----- The training set has size ``i * n_samples // (n_splits + 1) + n_samples % (n_splits + 1)`` in the ``i`` th split, with a test set of size ``n_samples//(n_splits + 1)`` by default, where ``n_samples`` is the number of samples. """def__init__(self,n_splits=5,*,max_train_size=None,test_size=None,gap=0):super().__init__(n_splits,shuffle=False,random_state=None)self.max_train_size=max_train_sizeself.test_size=test_sizeself.gap=gapdefsplit(self,X,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,) Always ignored, exists for compatibility. groups : array-like of shape (n_samples,) Always ignored, exists for compatibility. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """X,y,groups=indexable(X,y,groups)n_samples=_num_samples(X)n_splits=self.n_splitsn_folds=n_splits+1gap=self.gaptest_size=(self.test_sizeifself.test_sizeisnotNoneelsen_samples//n_folds)# Make sure we have enough samples for the given split parametersifn_folds>n_samples:raiseValueError(f"Cannot have number of folds={n_folds} greater"f" than the number of samples={n_samples}.")ifn_samples-gap-(test_size*n_splits)<=0:raiseValueError(f"Too many splits={n_splits} for number of samples"f"={n_samples} with test_size={test_size} and gap={gap}.")indices=np.arange(n_samples)test_starts=range(n_samples-n_splits*test_size,n_samples,test_size)fortest_startintest_starts:train_end=test_start-gapifself.max_train_sizeandself.max_train_size<train_end:yield(indices[train_end-self.max_train_size:train_end],indices[test_start:test_start+test_size],)else:yield(indices[:train_end],indices[test_start:test_start+test_size],)classLeaveOneGroupOut(GroupsConsumerMixin,BaseCrossValidator):"""Leave One Group Out cross-validator. Provides train/test indices to split data such that each training set is comprised of all samples except ones belonging to one specific group. Arbitrary domain specific group information is provided an array integers that encodes the group of each sample. For instance the groups could be the year of collection of the samples and thus allow for cross-validation against time-based splits. Read more in the :ref:`User Guide <leave_one_group_out>`. Notes ----- Splits are ordered according to the index of the group left out. The first split has testing set consisting of the group whose index in `groups` is lowest, and so on. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import LeaveOneGroupOut >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) >>> y = np.array([1, 2, 1, 2]) >>> groups = np.array([1, 1, 2, 2]) >>> logo = LeaveOneGroupOut() >>> logo.get_n_splits(X, y, groups) 2 >>> logo.get_n_splits(groups=groups) # 'groups' is always required 2 >>> print(logo) LeaveOneGroupOut() >>> for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}, group={groups[train_index]}") ... print(f" Test: index={test_index}, group={groups[test_index]}") Fold 0: Train: index=[2 3], group=[2 2] Test: index=[0 1], group=[1 1] Fold 1: Train: index=[0 1], group=[1 1] Test: index=[2 3], group=[2 2] See also -------- GroupKFold: K-fold iterator variant with non-overlapping groups. """def_iter_test_masks(self,X,y,groups):ifgroupsisNone:raiseValueError("The 'groups' parameter should not be None.")# We make a copy of groups to avoid side-effects during iterationgroups=check_array(groups,input_name="groups",copy=True,ensure_2d=False,dtype=None)unique_groups=np.unique(groups)iflen(unique_groups)<=1:raiseValueError("The groups parameter contains fewer than 2 unique groups ""(%s). LeaveOneGroupOut expects at least 2."%unique_groups)foriinunique_groups:yieldgroups==idefget_n_splits(self,X=None,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator. Parameters ---------- X : object Always ignored, exists for compatibility. y : object Always ignored, exists for compatibility. groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. This 'groups' parameter must always be specified to calculate the number of splits, though the other parameters can be omitted. Returns ------- n_splits : int Returns the number of splitting iterations in the cross-validator. """ifgroupsisNone:raiseValueError("The 'groups' parameter should not be None.")groups=check_array(groups,input_name="groups",ensure_2d=False,dtype=None)returnlen(np.unique(groups))defsplit(self,X,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """returnsuper().split(X,y,groups)classLeavePGroupsOut(GroupsConsumerMixin,BaseCrossValidator):"""Leave P Group(s) Out cross-validator. Provides train/test indices to split data according to a third-party provided group. This group information can be used to encode arbitrary domain specific stratifications of the samples as integers. For instance the groups could be the year of collection of the samples and thus allow for cross-validation against time-based splits. The difference between LeavePGroupsOut and LeaveOneGroupOut is that the former builds the test sets with all the samples assigned to ``p`` different values of the groups while the latter uses samples all assigned the same groups. Read more in the :ref:`User Guide <leave_p_groups_out>`. Parameters ---------- n_groups : int Number of groups (``p``) to leave out in the test split. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import LeavePGroupsOut >>> X = np.array([[1, 2], [3, 4], [5, 6]]) >>> y = np.array([1, 2, 1]) >>> groups = np.array([1, 2, 3]) >>> lpgo = LeavePGroupsOut(n_groups=2) >>> lpgo.get_n_splits(X, y, groups) 3 >>> lpgo.get_n_splits(groups=groups) # 'groups' is always required 3 >>> print(lpgo) LeavePGroupsOut(n_groups=2) >>> for i, (train_index, test_index) in enumerate(lpgo.split(X, y, groups)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}, group={groups[train_index]}") ... print(f" Test: index={test_index}, group={groups[test_index]}") Fold 0: Train: index=[2], group=[3] Test: index=[0 1], group=[1 2] Fold 1: Train: index=[1], group=[2] Test: index=[0 2], group=[1 3] Fold 2: Train: index=[0], group=[1] Test: index=[1 2], group=[2 3] See Also -------- GroupKFold : K-fold iterator variant with non-overlapping groups. """def__init__(self,n_groups):self.n_groups=n_groupsdef_iter_test_masks(self,X,y,groups):ifgroupsisNone:raiseValueError("The 'groups' parameter should not be None.")groups=check_array(groups,input_name="groups",copy=True,ensure_2d=False,dtype=None)unique_groups=np.unique(groups)ifself.n_groups>=len(unique_groups):raiseValueError("The groups parameter contains fewer than (or equal to) ""n_groups (%d) numbers of unique groups (%s). LeavePGroupsOut ""expects that at least n_groups + 1 (%d) unique groups be ""present"%(self.n_groups,unique_groups,self.n_groups+1))combi=combinations(range(len(unique_groups)),self.n_groups)forindicesincombi:test_index=np.zeros(_num_samples(X),dtype=bool)forlinunique_groups[np.array(indices)]:test_index[groups==l]=Trueyieldtest_indexdefget_n_splits(self,X=None,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator. Parameters ---------- X : object Always ignored, exists for compatibility. y : object Always ignored, exists for compatibility. groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. This 'groups' parameter must always be specified to calculate the number of splits, though the other parameters can be omitted. Returns ------- n_splits : int Returns the number of splitting iterations in the cross-validator. """ifgroupsisNone:raiseValueError("The 'groups' parameter should not be None.")groups=check_array(groups,input_name="groups",ensure_2d=False,dtype=None)returnint(comb(len(np.unique(groups)),self.n_groups,exact=True))defsplit(self,X,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """returnsuper().split(X,y,groups)class_RepeatedSplits(_MetadataRequester,metaclass=ABCMeta):"""Repeated splits for an arbitrary randomized CV splitter. Repeats splits for cross-validators n times with different randomization in each repetition. Parameters ---------- cv : callable Cross-validator class. n_repeats : int, default=10 Number of times cross-validator needs to be repeated. random_state : int, RandomState instance or None, default=None Passes `random_state` to the arbitrary repeating cross validator. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. **cvargs : additional params Constructor parameters for cv. Must not contain random_state and shuffle. """# This indicates that by default CV splitters don't have a "groups" kwarg,# unless indicated by inheriting from ``GroupsConsumerMixin``.# This also prevents ``set_split_request`` to be generated for splitters# which don't support ``groups``.__metadata_request__split={"groups":metadata_routing.UNUSED}def__init__(self,cv,*,n_repeats=10,random_state=None,**cvargs):ifnotisinstance(n_repeats,numbers.Integral):raiseValueError("Number of repetitions must be of Integral type.")ifn_repeats<=0:raiseValueError("Number of repetitions must be greater than 0.")ifany(keyincvargsforkeyin("random_state","shuffle")):raiseValueError("cvargs must not contain random_state or shuffle.")self.cv=cvself.n_repeats=n_repeatsself.random_state=random_stateself.cvargs=cvargsdefsplit(self,X,y=None,groups=None):"""Generates indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,) The target variable for supervised learning problems. groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """n_repeats=self.n_repeatsrng=check_random_state(self.random_state)foridxinrange(n_repeats):cv=self.cv(random_state=rng,shuffle=True,**self.cvargs)fortrain_index,test_indexincv.split(X,y,groups):yieldtrain_index,test_indexdefget_n_splits(self,X=None,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator. Parameters ---------- X : object Always ignored, exists for compatibility. ``np.zeros(n_samples)`` may be used as a placeholder. y : object Always ignored, exists for compatibility. ``np.zeros(n_samples)`` may be used as a placeholder. groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Returns ------- n_splits : int Returns the number of splitting iterations in the cross-validator. """rng=check_random_state(self.random_state)cv=self.cv(random_state=rng,shuffle=True,**self.cvargs)returncv.get_n_splits(X,y,groups)*self.n_repeatsdef__repr__(self):return_build_repr(self)classRepeatedKFold(_RepeatedSplits):"""Repeated K-Fold cross validator. Repeats K-Fold n times with different randomization in each repetition. Read more in the :ref:`User Guide <repeated_k_fold>`. Parameters ---------- n_splits : int, default=5 Number of folds. Must be at least 2. n_repeats : int, default=10 Number of times cross-validator needs to be repeated. random_state : int, RandomState instance or None, default=None Controls the randomness of each repeated cross-validation instance. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import RepeatedKFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 1, 1]) >>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124) >>> rkf.get_n_splits(X, y) 4 >>> print(rkf) RepeatedKFold(n_repeats=2, n_splits=2, random_state=2652124) >>> for i, (train_index, test_index) in enumerate(rkf.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") ... Fold 0: Train: index=[0 1] Test: index=[2 3] Fold 1: Train: index=[2 3] Test: index=[0 1] Fold 2: Train: index=[1 2] Test: index=[0 3] Fold 3: Train: index=[0 3] Test: index=[1 2] Notes ----- Randomized CV splitters may return different results for each call of split. You can make the results identical by setting `random_state` to an integer. See Also -------- RepeatedStratifiedKFold : Repeats Stratified K-Fold n times. """def__init__(self,*,n_splits=5,n_repeats=10,random_state=None):super().__init__(KFold,n_repeats=n_repeats,random_state=random_state,n_splits=n_splits)classRepeatedStratifiedKFold(_RepeatedSplits):"""Repeated Stratified K-Fold cross validator. Repeats Stratified K-Fold n times with different randomization in each repetition. Read more in the :ref:`User Guide <repeated_k_fold>`. Parameters ---------- n_splits : int, default=5 Number of folds. Must be at least 2. n_repeats : int, default=10 Number of times cross-validator needs to be repeated. random_state : int, RandomState instance or None, default=None Controls the generation of the random states for each repetition. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import RepeatedStratifiedKFold >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 1, 1]) >>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2, ... random_state=36851234) >>> rskf.get_n_splits(X, y) 4 >>> print(rskf) RepeatedStratifiedKFold(n_repeats=2, n_splits=2, random_state=36851234) >>> for i, (train_index, test_index) in enumerate(rskf.split(X, y)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") ... Fold 0: Train: index=[1 2] Test: index=[0 3] Fold 1: Train: index=[0 3] Test: index=[1 2] Fold 2: Train: index=[1 3] Test: index=[0 2] Fold 3: Train: index=[0 2] Test: index=[1 3] Notes ----- Randomized CV splitters may return different results for each call of split. You can make the results identical by setting `random_state` to an integer. See Also -------- RepeatedKFold : Repeats K-Fold n times. """def__init__(self,*,n_splits=5,n_repeats=10,random_state=None):super().__init__(StratifiedKFold,n_repeats=n_repeats,random_state=random_state,n_splits=n_splits,)classBaseShuffleSplit(_MetadataRequester,metaclass=ABCMeta):"""Base class for ShuffleSplit and StratifiedShuffleSplit."""# This indicates that by default CV splitters don't have a "groups" kwarg,# unless indicated by inheriting from ``GroupsConsumerMixin``.# This also prevents ``set_split_request`` to be generated for splitters# which don't support ``groups``.__metadata_request__split={"groups":metadata_routing.UNUSED}def__init__(self,n_splits=10,*,test_size=None,train_size=None,random_state=None):self.n_splits=n_splitsself.test_size=test_sizeself.train_size=train_sizeself.random_state=random_stateself._default_test_size=0.1defsplit(self,X,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,) The target variable for supervised learning problems. groups : array-like of shape (n_samples,), default=None Group labels for the samples used while splitting the dataset into train/test set. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. Notes ----- Randomized CV splitters may return different results for each call of split. You can make the results identical by setting `random_state` to an integer. """X,y,groups=indexable(X,y,groups)fortrain,testinself._iter_indices(X,y,groups):yieldtrain,test@abstractmethoddef_iter_indices(self,X,y=None,groups=None):"""Generate (train, test) indices"""defget_n_splits(self,X=None,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator. Parameters ---------- X : object Always ignored, exists for compatibility. y : object Always ignored, exists for compatibility. groups : object Always ignored, exists for compatibility. Returns ------- n_splits : int Returns the number of splitting iterations in the cross-validator. """returnself.n_splitsdef__repr__(self):return_build_repr(self)classShuffleSplit(BaseShuffleSplit):"""Random permutation cross-validator. Yields indices to split data into training and test sets. Note: contrary to other cross-validation strategies, random splits do not guarantee that all folds will be different, although this is still very likely for sizeable datasets. Read more in the :ref:`User Guide <ShuffleSplit>`. For visualisation of cross-validation behaviour and comparison between common scikit-learn split methods refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` Parameters ---------- n_splits : int, default=10 Number of re-shuffling & splitting iterations. test_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If ``train_size`` is also None, it will be set to 0.1. train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. random_state : int, RandomState instance or None, default=None Controls the randomness of the training and testing indices produced. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import ShuffleSplit >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]]) >>> y = np.array([1, 2, 1, 2, 1, 2]) >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0) >>> rs.get_n_splits(X) 5 >>> print(rs) ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None) >>> for i, (train_index, test_index) in enumerate(rs.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[1 3 0 4] Test: index=[5 2] Fold 1: Train: index=[4 0 2 5] Test: index=[1 3] Fold 2: Train: index=[1 2 4 0] Test: index=[3 5] Fold 3: Train: index=[3 4 1 0] Test: index=[5 2] Fold 4: Train: index=[3 5 1 0] Test: index=[2 4] >>> # Specify train and test size >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25, ... random_state=0) >>> for i, (train_index, test_index) in enumerate(rs.split(X)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[1 3 0] Test: index=[5 2] Fold 1: Train: index=[4 0 2] Test: index=[1 3] Fold 2: Train: index=[1 2 4] Test: index=[3 5] Fold 3: Train: index=[3 4 1] Test: index=[5 2] Fold 4: Train: index=[3 5 1] Test: index=[2 4] """def__init__(self,n_splits=10,*,test_size=None,train_size=None,random_state=None):super().__init__(n_splits=n_splits,test_size=test_size,train_size=train_size,random_state=random_state,)self._default_test_size=0.1def_iter_indices(self,X,y=None,groups=None):n_samples=_num_samples(X)n_train,n_test=_validate_shuffle_split(n_samples,self.test_size,self.train_size,default_test_size=self._default_test_size,)rng=check_random_state(self.random_state)foriinrange(self.n_splits):# random partitionpermutation=rng.permutation(n_samples)ind_test=permutation[:n_test]ind_train=permutation[n_test:(n_test+n_train)]yieldind_train,ind_testclassGroupShuffleSplit(GroupsConsumerMixin,ShuffleSplit):"""Shuffle-Group(s)-Out cross-validation iterator. Provides randomized train/test indices to split data according to a third-party provided group. This group information can be used to encode arbitrary domain specific stratifications of the samples as integers. For instance the groups could be the year of collection of the samples and thus allow for cross-validation against time-based splits. The difference between LeavePGroupsOut and GroupShuffleSplit is that the former generates splits using all subsets of size ``p`` unique groups, whereas GroupShuffleSplit generates a user-determined number of random test splits, each with a user-determined fraction of unique groups. For example, a less computationally intensive alternative to ``LeavePGroupsOut(p=10)`` would be ``GroupShuffleSplit(test_size=10, n_splits=100)``. Note: The parameters ``test_size`` and ``train_size`` refer to groups, and not to samples, as in ShuffleSplit. Read more in the :ref:`User Guide <group_shuffle_split>`. For visualisation of cross-validation behaviour and comparison between common scikit-learn split methods refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` Parameters ---------- n_splits : int, default=5 Number of re-shuffling & splitting iterations. test_size : float, int, default=0.2 If float, should be between 0.0 and 1.0 and represent the proportion of groups to include in the test split (rounded up). If int, represents the absolute number of test groups. If None, the value is set to the complement of the train size. The default will change in version 0.21. It will remain 0.2 only if ``train_size`` is unspecified, otherwise it will complement the specified ``train_size``. train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the groups to include in the train split. If int, represents the absolute number of train groups. If None, the value is automatically set to the complement of the test size. random_state : int, RandomState instance or None, default=None Controls the randomness of the training and testing indices produced. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import GroupShuffleSplit >>> X = np.ones(shape=(8, 2)) >>> y = np.ones(shape=(8, 1)) >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3]) >>> print(groups.shape) (8,) >>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42) >>> gss.get_n_splits() 2 >>> print(gss) GroupShuffleSplit(n_splits=2, random_state=42, test_size=None, train_size=0.7) >>> for i, (train_index, test_index) in enumerate(gss.split(X, y, groups)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}, group={groups[train_index]}") ... print(f" Test: index={test_index}, group={groups[test_index]}") Fold 0: Train: index=[2 3 4 5 6 7], group=[2 2 2 3 3 3] Test: index=[0 1], group=[1 1] Fold 1: Train: index=[0 1 5 6 7], group=[1 1 3 3 3] Test: index=[2 3 4], group=[2 2 2] See Also -------- ShuffleSplit : Shuffles samples to create independent test/train sets. LeavePGroupsOut : Train set leaves out all possible subsets of `p` groups. """def__init__(self,n_splits=5,*,test_size=None,train_size=None,random_state=None):super().__init__(n_splits=n_splits,test_size=test_size,train_size=train_size,random_state=random_state,)self._default_test_size=0.2def_iter_indices(self,X,y,groups):ifgroupsisNone:raiseValueError("The 'groups' parameter should not be None.")groups=check_array(groups,input_name="groups",ensure_2d=False,dtype=None)classes,group_indices=np.unique(groups,return_inverse=True)forgroup_train,group_testinsuper()._iter_indices(X=classes):# these are the indices of classes in the partition# invert them into data indicestrain=np.flatnonzero(np.isin(group_indices,group_train))test=np.flatnonzero(np.isin(group_indices,group_test))yieldtrain,testdefsplit(self,X,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,), default=None The target variable for supervised learning problems. groups : array-like of shape (n_samples,) Group labels for the samples used while splitting the dataset into train/test set. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. Notes ----- Randomized CV splitters may return different results for each call of split. You can make the results identical by setting `random_state` to an integer. """returnsuper().split(X,y,groups)classStratifiedShuffleSplit(BaseShuffleSplit):"""Stratified ShuffleSplit cross-validator. Provides train/test indices to split data in train/test sets. This cross-validation object is a merge of StratifiedKFold and ShuffleSplit, which returns stratified randomized folds. The folds are made by preserving the percentage of samples for each class. Note: like the ShuffleSplit strategy, stratified random splits do not guarantee that all folds will be different, although this is still very likely for sizeable datasets. Read more in the :ref:`User Guide <stratified_shuffle_split>`. For visualisation of cross-validation behaviour and comparison between common scikit-learn split methods refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py` Parameters ---------- n_splits : int, default=10 Number of re-shuffling & splitting iterations. test_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If ``train_size`` is also None, it will be set to 0.1. train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. random_state : int, RandomState instance or None, default=None Controls the randomness of the training and testing indices produced. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import StratifiedShuffleSplit >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 0, 1, 1, 1]) >>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0) >>> sss.get_n_splits(X, y) 5 >>> print(sss) StratifiedShuffleSplit(n_splits=5, random_state=0, ...) >>> for i, (train_index, test_index) in enumerate(sss.split(X, y)): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[5 2 3] Test: index=[4 1 0] Fold 1: Train: index=[5 1 4] Test: index=[0 2 3] Fold 2: Train: index=[5 0 2] Test: index=[4 3 1] Fold 3: Train: index=[4 1 0] Test: index=[2 3 5] Fold 4: Train: index=[0 5 1] Test: index=[3 4 2] """def__init__(self,n_splits=10,*,test_size=None,train_size=None,random_state=None):super().__init__(n_splits=n_splits,test_size=test_size,train_size=train_size,random_state=random_state,)self._default_test_size=0.1def_iter_indices(self,X,y,groups=None):n_samples=_num_samples(X)y=check_array(y,input_name="y",ensure_2d=False,dtype=None)n_train,n_test=_validate_shuffle_split(n_samples,self.test_size,self.train_size,default_test_size=self._default_test_size,)ify.ndim==2:# for multi-label y, map each distinct row to a string repr# using join because str(row) uses an ellipsis if len(row) > 1000y=np.array([" ".join(row.astype("str"))forrowiny])classes,y_indices=np.unique(y,return_inverse=True)n_classes=classes.shape[0]class_counts=np.bincount(y_indices)ifnp.min(class_counts)<2:raiseValueError("The least populated class in y has only 1"" member, which is too few. The minimum"" number of groups for any class cannot"" be less than 2.")ifn_train<n_classes:raiseValueError("The train_size = %d should be greater or ""equal to the number of classes = %d"%(n_train,n_classes))ifn_test<n_classes:raiseValueError("The test_size = %d should be greater or ""equal to the number of classes = %d"%(n_test,n_classes))# Find the sorted list of instances for each class:# (np.unique above performs a sort, so code is O(n logn) already)class_indices=np.split(np.argsort(y_indices,kind="mergesort"),np.cumsum(class_counts)[:-1])rng=check_random_state(self.random_state)for_inrange(self.n_splits):# if there are ties in the class-counts, we want# to make sure to break them anew in each iterationn_i=_approximate_mode(class_counts,n_train,rng)class_counts_remaining=class_counts-n_it_i=_approximate_mode(class_counts_remaining,n_test,rng)train=[]test=[]foriinrange(n_classes):permutation=rng.permutation(class_counts[i])perm_indices_class_i=class_indices[i].take(permutation,mode="clip")train.extend(perm_indices_class_i[:n_i[i]])test.extend(perm_indices_class_i[n_i[i]:n_i[i]+t_i[i]])train=rng.permutation(train)test=rng.permutation(test)yieldtrain,testdefsplit(self,X,y,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : array-like of shape (n_samples, n_features) Training data, where `n_samples` is the number of samples and `n_features` is the number of features. Note that providing ``y`` is sufficient to generate the splits and hence ``np.zeros(n_samples)`` may be used as a placeholder for ``X`` instead of actual training data. y : array-like of shape (n_samples,) or (n_samples, n_labels) The target variable for supervised learning problems. Stratification is done based on the y labels. groups : object Always ignored, exists for compatibility. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. Notes ----- Randomized CV splitters may return different results for each call of split. You can make the results identical by setting `random_state` to an integer. """y=check_array(y,input_name="y",ensure_2d=False,dtype=None)returnsuper().split(X,y,groups)def_validate_shuffle_split(n_samples,test_size,train_size,default_test_size=None):""" Validation helper to check if the test/test sizes are meaningful w.r.t. the size of the data (n_samples). """iftest_sizeisNoneandtrain_sizeisNone:test_size=default_test_sizetest_size_type=np.asarray(test_size).dtype.kindtrain_size_type=np.asarray(train_size).dtype.kindif(test_size_type=="i"and(test_size>=n_samplesortest_size<=0)ortest_size_type=="f"and(test_size<=0ortest_size>=1)):raiseValueError("test_size={0} should be either positive and smaller"" than the number of samples {1} or a float in the ""(0, 1) range".format(test_size,n_samples))if(train_size_type=="i"and(train_size>=n_samplesortrain_size<=0)ortrain_size_type=="f"and(train_size<=0ortrain_size>=1)):raiseValueError("train_size={0} should be either positive and smaller"" than the number of samples {1} or a float in the ""(0, 1) range".format(train_size,n_samples))iftrain_sizeisnotNoneandtrain_size_typenotin("i","f"):raiseValueError("Invalid value for train_size: {}".format(train_size))iftest_sizeisnotNoneandtest_size_typenotin("i","f"):raiseValueError("Invalid value for test_size: {}".format(test_size))iftrain_size_type=="f"andtest_size_type=="f"andtrain_size+test_size>1:raiseValueError("The sum of test_size and train_size = {}, should be in the (0, 1)"" range. Reduce test_size and/or train_size.".format(train_size+test_size))iftest_size_type=="f":n_test=ceil(test_size*n_samples)eliftest_size_type=="i":n_test=float(test_size)iftrain_size_type=="f":n_train=floor(train_size*n_samples)eliftrain_size_type=="i":n_train=float(train_size)iftrain_sizeisNone:n_train=n_samples-n_testeliftest_sizeisNone:n_test=n_samples-n_trainifn_train+n_test>n_samples:raiseValueError("The sum of train_size and test_size = %d, ""should be smaller than the number of ""samples %d. Reduce test_size and/or ""train_size."%(n_train+n_test,n_samples))n_train,n_test=int(n_train),int(n_test)ifn_train==0:raiseValueError("With n_samples={}, test_size={} and train_size={}, the ""resulting train set will be empty. Adjust any of the ""aforementioned parameters.".format(n_samples,test_size,train_size))returnn_train,n_testclassPredefinedSplit(BaseCrossValidator):"""Predefined split cross-validator. Provides train/test indices to split data into train/test sets using a predefined scheme specified by the user with the ``test_fold`` parameter. Read more in the :ref:`User Guide <predefined_split>`. .. versionadded:: 0.16 Parameters ---------- test_fold : array-like of shape (n_samples,) The entry ``test_fold[i]`` represents the index of the test set that sample ``i`` belongs to. It is possible to exclude sample ``i`` from any test set (i.e. include sample ``i`` in every training set) by setting ``test_fold[i]`` equal to -1. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import PredefinedSplit >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) >>> y = np.array([0, 0, 1, 1]) >>> test_fold = [0, 1, -1, 1] >>> ps = PredefinedSplit(test_fold) >>> ps.get_n_splits() 2 >>> print(ps) PredefinedSplit(test_fold=array([ 0, 1, -1, 1])) >>> for i, (train_index, test_index) in enumerate(ps.split()): ... print(f"Fold {i}:") ... print(f" Train: index={train_index}") ... print(f" Test: index={test_index}") Fold 0: Train: index=[1 2 3] Test: index=[0] Fold 1: Train: index=[0 2] Test: index=[1 3] """def__init__(self,test_fold):self.test_fold=np.array(test_fold,dtype=int)self.test_fold=column_or_1d(self.test_fold)self.unique_folds=np.unique(self.test_fold)self.unique_folds=self.unique_folds[self.unique_folds!=-1]defsplit(self,X=None,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : object Always ignored, exists for compatibility. y : object Always ignored, exists for compatibility. groups : object Always ignored, exists for compatibility. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """ind=np.arange(len(self.test_fold))fortest_indexinself._iter_test_masks():train_index=ind[np.logical_not(test_index)]test_index=ind[test_index]yieldtrain_index,test_indexdef_iter_test_masks(self):"""Generates boolean masks corresponding to test sets."""forfinself.unique_folds:test_index=np.where(self.test_fold==f)[0]test_mask=np.zeros(len(self.test_fold),dtype=bool)test_mask[test_index]=Trueyieldtest_maskdefget_n_splits(self,X=None,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator. Parameters ---------- X : object Always ignored, exists for compatibility. y : object Always ignored, exists for compatibility. groups : object Always ignored, exists for compatibility. Returns ------- n_splits : int Returns the number of splitting iterations in the cross-validator. """returnlen(self.unique_folds)class_CVIterableWrapper(BaseCrossValidator):"""Wrapper class for old style cv objects and iterables."""def__init__(self,cv):self.cv=list(cv)defget_n_splits(self,X=None,y=None,groups=None):"""Returns the number of splitting iterations in the cross-validator. Parameters ---------- X : object Always ignored, exists for compatibility. y : object Always ignored, exists for compatibility. groups : object Always ignored, exists for compatibility. Returns ------- n_splits : int Returns the number of splitting iterations in the cross-validator. """returnlen(self.cv)defsplit(self,X=None,y=None,groups=None):"""Generate indices to split data into training and test set. Parameters ---------- X : object Always ignored, exists for compatibility. y : object Always ignored, exists for compatibility. groups : object Always ignored, exists for compatibility. Yields ------ train : ndarray The training set indices for that split. test : ndarray The testing set indices for that split. """fortrain,testinself.cv:yieldtrain,testdefcheck_cv(cv=5,y=None,*,classifier=False):"""Input checker utility for building a cross-validator. Parameters ---------- cv : int, cross-validation generator or an iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross validation, - integer, to specify the number of folds. - :term:`CV splitter`, - An iterable that generates (train, test) splits as arrays of indices. For integer/None inputs, if classifier is True and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is used. In all other cases, :class:`KFold` is used. Refer :ref:`User Guide <cross_validation>` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22 ``cv`` default value changed from 3-fold to 5-fold. y : array-like, default=None The target variable for supervised learning problems. classifier : bool, default=False Whether the task is a classification task, in which case stratified KFold will be used. Returns ------- checked_cv : a cross-validator instance. The return value is a cross-validator which generates the train/test splits via the ``split`` method. Examples -------- >>> from sklearn.model_selection import check_cv >>> check_cv(cv=5, y=None, classifier=False) KFold(...) >>> check_cv(cv=5, y=[1, 1, 0, 0, 0, 0], classifier=True) StratifiedKFold(...) """cv=5ifcvisNoneelsecvifisinstance(cv,numbers.Integral):if(classifierand(yisnotNone)and(type_of_target(y,input_name="y")in("binary","multiclass"))):returnStratifiedKFold(cv)else:returnKFold(cv)ifnothasattr(cv,"split")orisinstance(cv,str):ifnotisinstance(cv,Iterable)orisinstance(cv,str):raiseValueError("Expected cv as an integer, cross-validation ""object (from sklearn.model_selection) ""or an iterable. Got %s."%cv)return_CVIterableWrapper(cv)returncv# New style cv objects are passed without any modification@validate_params({"test_size":[Interval(RealNotInt,0,1,closed="neither"),Interval(numbers.Integral,1,None,closed="left"),None,],"train_size":[Interval(RealNotInt,0,1,closed="neither"),Interval(numbers.Integral,1,None,closed="left"),None,],"random_state":["random_state"],"shuffle":["boolean"],"stratify":["array-like",None],},prefer_skip_nested_validation=True,)deftrain_test_split(*arrays,test_size=None,train_size=None,random_state=None,shuffle=True,stratify=None,):"""Split arrays or matrices into random train and test subsets. Quick utility that wraps input validation, ``next(ShuffleSplit().split(X, y))``, and application to input data into a single call for splitting (and optionally subsampling) data into a one-liner. Read more in the :ref:`User Guide <cross_validation>`. Parameters ---------- *arrays : sequence of indexables with same length / shape[0] Allowed inputs are lists, numpy arrays, scipy-sparse matrices or pandas dataframes. test_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If ``train_size`` is also None, it will be set to 0.25. train_size : float or int, default=None If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. If None, the value is automatically set to the complement of the test size. random_state : int, RandomState instance or None, default=None Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls. See :term:`Glossary <random_state>`. shuffle : bool, default=True Whether or not to shuffle the data before splitting. If shuffle=False then stratify must be None. stratify : array-like, default=None If not None, data is split in a stratified fashion, using this as the class labels. Read more in the :ref:`User Guide <stratification>`. Returns ------- splitting : list, length=2 * len(arrays) List containing train-test split of inputs. .. versionadded:: 0.16 If the input is sparse, the output will be a ``scipy.sparse.csr_matrix``. Else, output type is the same as the input type. Examples -------- >>> import numpy as np >>> from sklearn.model_selection import train_test_split >>> X, y = np.arange(10).reshape((5, 2)), range(5) >>> X array([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) >>> list(y) [0, 1, 2, 3, 4] >>> X_train, X_test, y_train, y_test = train_test_split( ... X, y, test_size=0.33, random_state=42) ... >>> X_train array([[4, 5], [0, 1], [6, 7]]) >>> y_train [2, 0, 3] >>> X_test array([[2, 3], [8, 9]]) >>> y_test [1, 4] >>> train_test_split(y, shuffle=False) [[0, 1, 2], [3, 4]] """n_arrays=len(arrays)ifn_arrays==0:raiseValueError("At least one array required as input")arrays=indexable(*arrays)n_samples=_num_samples(arrays[0])n_train,n_test=_validate_shuffle_split(n_samples,test_size,train_size,default_test_size=0.25)ifshuffleisFalse:ifstratifyisnotNone:raiseValueError("Stratified train/test split is not implemented for shuffle=False")train=np.arange(n_train)test=np.arange(n_train,n_train+n_test)else:ifstratifyisnotNone:CVClass=StratifiedShuffleSplitelse:CVClass=ShuffleSplitcv=CVClass(test_size=n_test,train_size=n_train,random_state=random_state)train,test=next(cv.split(X=arrays[0],y=stratify))returnlist(chain.from_iterable((_safe_indexing(a,train),_safe_indexing(a,test))forainarrays))# Tell nose that train_test_split is not a test.# (Needed for external libraries that may use nose.)# Use setattr to avoid mypy errors when monkeypatching.setattr(train_test_split,"__test__",False)def_pprint(params,offset=0,printer=repr):"""Pretty print the dictionary 'params' Parameters ---------- params : dict The dictionary to pretty print offset : int, default=0 The offset in characters to add at the begin of each line. printer : callable, default=repr The function to convert entries to strings, typically the builtin str or repr """# Do a multi-line justified repr:options=np.get_printoptions()np.set_printoptions(precision=5,threshold=64,edgeitems=2)params_list=list()this_line_length=offsetline_sep=",\n"+(1+offset//2)*" "fori,(k,v)inenumerate(sorted(params.items())):ifisinstance(v,float):# use str for representing floating point numbers# this way we get consistent representation across# architectures and versions.this_repr="%s=%s"%(k,str(v))else:# use repr of the restthis_repr="%s=%s"%(k,printer(v))iflen(this_repr)>500:this_repr=this_repr[:300]+"..."+this_repr[-100:]ifi>0:ifthis_line_length+len(this_repr)>=75or"\n"inthis_repr:params_list.append(line_sep)this_line_length=len(line_sep)else:params_list.append(", ")this_line_length+=2params_list.append(this_repr)this_line_length+=len(this_repr)np.set_printoptions(**options)lines="".join(params_list)# Strip trailing space to avoid nightmare in doctestslines="\n".join(l.rstrip(" ")forlinlines.split("\n"))returnlinesdef_build_repr(self):# XXX This is copied from BaseEstimator's get_paramscls=self.__class__init=getattr(cls.__init__,"deprecated_original",cls.__init__)# Ignore varargs, kw and default values and pop selfinit_signature=signature(init)# Consider the constructor parameters excluding 'self'ifinitisobject.__init__:args=[]else:args=sorted([p.nameforpininit_signature.parameters.values()ifp.name!="self"andp.kind!=p.VAR_KEYWORD])class_name=self.__class__.__name__params=dict()forkeyinargs:# We need deprecation warnings to always be on in order to# catch deprecated param values.# This is set in utils/__init__.py but it gets overwritten# when running under python3 somehow.warnings.simplefilter("always",FutureWarning)try:withwarnings.catch_warnings(record=True)asw:value=getattr(self,key,None)ifvalueisNoneandhasattr(self,"cvargs"):value=self.cvargs.get(key,None)iflen(w)andw[0].category==FutureWarning:# if the parameter is deprecated, don't show itcontinuefinally:warnings.filters.pop(0)params[key]=valuereturn"%s(%s)"%(class_name,_pprint(params,offset=len(class_name)))def_yields_constant_splits(cv):# Return True if calling cv.split() always returns the same splits# We assume that if a cv doesn't have a shuffle parameter, it shuffles by# default (e.g. ShuffleSplit). If it actually doesn't shuffle (e.g.# LeaveOneOut), then it won't have a random_state parameter anyway, in# which case it will default to 0, leading to output=Trueshuffle=getattr(cv,"shuffle",True)random_state=getattr(cv,"random_state",0)returnisinstance(random_state,numbers.Integral)ornotshuffle