Source code for evalml.preprocessing.data_splitters.training_validation_split

"""Training Validation Split class."""
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection._split import BaseCrossValidator


[docs]class TrainingValidationSplit(BaseCrossValidator): """Split the training data into training and validation sets. Args: test_size (float): What percentage of data points should be included in the validation set. Defalts to the complement of `train_size` if `train_size` is set, and 0.25 otherwise. train_size (float): What percentage of data points should be included in the training set. Defaults to the complement of `test_size` shuffle (boolean): Whether to shuffle the data before splitting. Defaults to False. stratify (list): Splits the data in a stratified fashion, using this argument as class labels. Defaults to None. random_seed (int): The seed to use for random sampling. Defaults to 0. Examples: >>> import numpy as np >>> import pandas as pd ... >>> X = pd.DataFrame([i for i in range(10)], columns=["First"]) >>> y = pd.Series([i for i in range(10)]) ... >>> tv_split = TrainingValidationSplit() >>> split_ = next(tv_split.split(X, y)) >>> assert (split_[0] == np.array([0, 1, 2, 3, 4, 5, 6])).all() >>> assert (split_[1] == np.array([7, 8, 9])).all() ... ... >>> tv_split = TrainingValidationSplit(test_size=0.5) >>> split_ = next(tv_split.split(X, y)) >>> assert (split_[0] == np.array([0, 1, 2, 3, 4])).all() >>> assert (split_[1] == np.array([5, 6, 7, 8, 9])).all() ... ... >>> tv_split = TrainingValidationSplit(shuffle=True) >>> split_ = next(tv_split.split(X, y)) >>> assert (split_[0] == np.array([9, 1, 6, 7, 3, 0, 5])).all() >>> assert (split_[1] == np.array([2, 8, 4])).all() ... ... >>> y = pd.Series([i % 3 for i in range(10)]) >>> tv_split = TrainingValidationSplit(shuffle=True, stratify=y) >>> split_ = next(tv_split.split(X, y)) >>> assert (split_[0] == np.array([1, 9, 3, 2, 8, 6, 7])).all() >>> assert (split_[1] == np.array([0, 4, 5])).all() """ def __init__( self, test_size=None, train_size=None, shuffle=False, stratify=None, random_seed=0, ): self.test_size = test_size self.train_size = train_size self.shuffle = shuffle self.stratify = stratify self.random_seed = random_seed
[docs] @staticmethod def get_n_splits(): """Return the number of splits of this object. Returns: int: Always returns 1. """ return 1
[docs] def split(self, X, y=None): """Divide the data into training and testing sets. Args: X (pd.DataFrame): Dataframe of points to split y (pd.Series): Series of points to split Returns: list: Indices to split data into training and test set """ train, test = train_test_split( np.arange(X.shape[0]), test_size=self.test_size, train_size=self.train_size, shuffle=self.shuffle, stratify=self.stratify, random_state=self.random_seed, ) return iter([(train, test)])