Source code for evalml.data_checks.ts_splitting_data_check
"""Data check that checks whether the time series training and validation splits have adequate class representation."""
from sklearn.model_selection import TimeSeriesSplit as SkTimeSeriesSplit
from evalml.data_checks import DataCheck, DataCheckError, DataCheckMessageCode
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.utils import infer_feature_types
class TimeSeriesSplittingDataCheck(DataCheck):
    """Checks whether the time series target data is compatible with splitting.

    If the target data in the training and validation of every split doesn't have representation from
    all classes (for time series classification problems) this will prevent the estimators from training
    on all potential outcomes which will cause errors during prediction.

    Args:
        problem_type (str or ProblemTypes): Problem type.
        n_splits (int): Number of time series splits.

    Raises:
        ValueError: If ``problem_type`` is not time series binary or time series multiclass.
    """

    def __init__(self, problem_type, n_splits):
        self.problem_type = problem_type
        if handle_problem_types(self.problem_type) not in [
            ProblemTypes.TIME_SERIES_BINARY,
            ProblemTypes.TIME_SERIES_MULTICLASS,
        ]:
            raise ValueError(
                "Valid splitting of labels in time series is only defined for time series binary and time series multiclass problem types.",
            )
        self.n_splits = n_splits
        self._splitter = SkTimeSeriesSplit(n_splits=self.n_splits)

    def validate(self, X, y):
        """Check if the training and validation targets are compatible with time series data splitting.

        Args:
            X (pd.DataFrame, np.ndarray): Ignored. Features.
            y (pd.Series, np.ndarray): Target data. If None, no check is performed and an empty
                list is returned.

        Returns:
            list[dict]: A list holding one DataCheckError dict if splitting would result in inadequate
                class representation, otherwise an empty list.

        Example:
            >>> import pandas as pd

            Passing n_splits as 3 means that the data will be segmented into 4 parts to be iterated over for training
            and validation splits. The first split results in training indices of [0:25] and validation indices of [25:50].
            The training indices of the first split result in only one unique value (0).
            The third split results in training indices of [0:75] and validation indices of [75:100]. The validation indices
            of the third split result in only one unique value (1).

            >>> X = None
            >>> y = pd.Series([0 if i < 45 else i % 2 if i < 55 else 1 for i in range(100)])
            >>> ts_splitting_check = TimeSeriesSplittingDataCheck("time series binary", 3)
            >>> assert ts_splitting_check.validate(X, y) == [
            ...     {
            ...         "message": "Time Series Binary and Time Series Multiclass problem "
            ...                    "types require every training and validation split to "
            ...                    "have at least one instance of all the target classes. "
            ...                    "The following splits are invalid: [1, 3]",
            ...         "data_check_name": "TimeSeriesSplittingDataCheck",
            ...         "level": "error",
            ...         "details": {
            ...             "columns": None, "rows": None,
            ...             "invalid_splits": {
            ...                 1: {"Training": [0, 25]},
            ...                 3: {"Validation": [75, 100]}
            ...             }
            ...         },
            ...         "code": "TIMESERIES_TARGET_NOT_COMPATIBLE_WITH_SPLIT",
            ...         "action_options": []
            ...     }
            ... ]
        """
        messages = []
        # Guard before touching y: without a target there is nothing to split or count.
        if y is None:
            return messages

        y = infer_feature_types(y)
        y_unique = y.nunique()
        invalid_splits = {}
        # Splits are reported 1-based, matching the numbering used in the error message.
        for split_num, (train, val) in enumerate(
            self._splitter.split(X=y),
            start=1,
        ):
            invalid_dict = {}
            # The splitter yields positional indices, so select by position (.iloc)
            # rather than by label; label lookup breaks on a non-default index.
            # Assumes infer_feature_types returns a pandas Series, as .nunique() above implies.
            train_targets = y.iloc[train]
            val_targets = y.iloc[val]
            if train_targets.nunique() < y_unique:
                invalid_dict["Training"] = [0, len(train)]
            if val_targets.nunique() < y_unique:
                invalid_dict["Validation"] = [len(train), len(train) + len(val)]
            if invalid_dict:
                invalid_splits[split_num] = invalid_dict

        if invalid_splits:
            messages.append(
                DataCheckError(
                    message=f"Time Series Binary and Time Series Multiclass problem types require every training "
                    f"and validation split to have at least one instance of all the target classes. "
                    f"The following splits are invalid: {list(invalid_splits)}",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TIMESERIES_TARGET_NOT_COMPATIBLE_WITH_SPLIT,
                    details={
                        "invalid_splits": invalid_splits,
                    },
                ).to_dict(),
            )
        return messages