"""Pipeline base class for time-series classification problems."""
import numpy as np
import pandas as pd
import woodwork as ww
from .binary_classification_pipeline_mixin import (
BinaryClassificationPipelineMixin,
)
from evalml.objectives import get_objective
from evalml.pipelines.classification_pipeline import ClassificationPipeline
from evalml.pipelines.time_series_pipeline_base import TimeSeriesPipelineBase
from evalml.problem_types import ProblemTypes
from evalml.utils import infer_feature_types
class TimeSeriesClassificationPipeline(TimeSeriesPipelineBase, ClassificationPipeline):
    """Pipeline base class for time series classification problems.

    Args:
        component_graph (ComponentGraph, list, dict): ComponentGraph instance, list of components in order, or dictionary of components.
            Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
            [Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names
            ["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"]
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
            An empty dictionary {} implies using all default values for component parameters. Pipeline-level
            parameters such as date_index, gap, and max_delay must be specified with the "pipeline" key. For example:
            Pipeline(parameters={"pipeline": {"date_index": "Date", "max_delay": 4, "gap": 2}}).
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    def fit(self, X, y):
        """Fit a time series classification pipeline.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features].
            y (pd.Series, np.ndarray): The training targets of length [n_samples].

        Returns:
            self
        """
        X, y = self._convert_to_woodwork(X, y)
        self._fit(X, y)
        # Remember the distinct target values seen during fit; they are later
        # used (via self.classes_) to label the predicted-probability columns.
        self._classes_ = list(ww.init_series(np.unique(y)))
        return self

    def _estimator_predict_proba(self, features, y):
        """Get estimator predicted probabilities.

        This helper passes y as an argument if needed by the estimator.

        Args:
            features: Input handed to the estimator's predict_proba.
            y: Target forwarded to the estimator only when
                ``self.estimator.predict_uses_y`` is truthy; otherwise None is passed.

        Returns:
            Whatever ``self.estimator.predict_proba`` returns.
        """
        y_arg = y if self.estimator.predict_uses_y else None
        return self.estimator.predict_proba(features, y=y_arg)

    def predict_proba_in_sample(self, X_holdout, y_holdout, X_train, y_train):
        """Predict on future data where the target is known, e.g. cross validation.

        Args:
            X_holdout (pd.DataFrame or np.ndarray): Future data of shape [n_samples, n_features].
            y_holdout (pd.Series, np.ndarray): Future target of shape [n_samples].
            X_train (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features].
            y_train (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train].

        Returns:
            pd.DataFrame: Estimated probabilities, indexed like y_holdout, with columns
                renamed to the pipeline's classes.

        Raises:
            ValueError: If the final component is not an Estimator.
        """
        if self.estimator is None:
            raise ValueError(
                "Cannot call predict_proba_in_sample() on a component graph because the final component is not an Estimator."
            )
        features = self.transform_all_but_final(X_holdout, y_holdout, X_train, y_train)
        proba = self._estimator_predict_proba(features, y_holdout)
        # Align the probabilities with the holdout target's index.
        proba.index = y_holdout.index
        # Pair each probability column, in order, with the corresponding class label.
        proba = proba.ww.rename(columns=dict(zip(proba.columns, self.classes_)))
        return infer_feature_types(proba)

    def predict_in_sample(self, X, y, X_train, y_train, objective=None):
        """Predict on future data where the target is known, e.g. cross validation.

        Note: the estimator's predictions are cast to int before being inverse-transformed.
        This addresses boolean values that may be returned from calculating predictions,
        which we would not otherwise be able to transform if we originally had integer targets.

        Args:
            X (pd.DataFrame or np.ndarray): Future data of shape [n_samples, n_features].
            y (pd.Series, np.ndarray): Future target of shape [n_samples].
            X_train (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features].
            y_train (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train].
            objective (ObjectiveBase, str, None): Objective used to threshold predicted probabilities, optional.
                This implementation does not read it; it is accepted so that overriding
                implementations can share the same signature.

        Returns:
            pd.Series: Estimated labels.

        Raises:
            ValueError: If final component is not an Estimator.
        """
        if self.estimator is None:
            raise ValueError(
                "Cannot call predict_in_sample() on a component graph because the final component is not an Estimator."
            )
        features = self.transform_all_but_final(X, y, X_train, y_train)
        predictions = self._estimator_predict(features, y)
        predictions.index = y.index
        predictions = self.inverse_transform(predictions.astype(int))
        predictions = pd.Series(predictions, name=self.input_target_name)
        # Map each current index label onto the label at the same position in
        # y.index, so the result is indexed like y even if the steps above
        # produced a different index.
        predictions = predictions.rename(index=dict(zip(predictions.index, y.index)))
        return infer_feature_types(predictions)

    def predict_proba(self, X, X_train=None, y_train=None):
        """Predict on future data where the target is unknown.

        Args:
            X (pd.DataFrame or np.ndarray): Future data of shape [n_samples, n_features].
            X_train (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features].
            y_train (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train].

        Returns:
            pd.DataFrame: Estimated probabilities, as returned by predict_proba_in_sample.

        Raises:
            ValueError: If final component is not an Estimator.
        """
        if self.estimator is None:
            raise ValueError(
                "Cannot call predict_proba() on a component graph because the final component is not an Estimator."
            )
        X_train, y_train = self._convert_to_woodwork(X_train, y_train)
        X = infer_feature_types(X)
        self._validate_holdout_datasets(X, X_train)
        # The true future target is unknown here, so a stand-in series is built
        # from y_train and aligned with X before delegating to the in-sample path.
        y_holdout = self._create_empty_series(y_train)
        y_holdout = infer_feature_types(y_holdout)
        y_holdout.index = X.index
        return self.predict_proba_in_sample(X, y_holdout, X_train, y_train)

    def _compute_predictions(self, X, y, X_train, y_train, objectives):
        """Compute only the kinds of predictions the given objectives require.

        Args:
            X: Holdout features.
            y: Holdout target.
            X_train: Data the pipeline was trained on.
            y_train: Targets used to train the pipeline.
            objectives: Iterable of objectives exposing ``score_needs_proba``.

        Returns:
            tuple: ``(y_predicted, y_predicted_proba)``. ``y_predicted_proba`` is None
                unless at least one objective needs probabilities; ``y_predicted``
                (encoded with ``_encode_targets``) is None unless at least one does not.
        """
        y_predicted = None
        y_predicted_proba = None
        if any(o.score_needs_proba for o in objectives):
            y_predicted_proba = self.predict_proba_in_sample(X, y, X_train, y_train)
        if any(not o.score_needs_proba for o in objectives):
            y_predicted = self.predict_in_sample(X, y, X_train, y_train)
            y_predicted = self._encode_targets(y_predicted)
        return y_predicted, y_predicted_proba

    def score(self, X, y, objectives, X_train=None, y_train=None):
        """Evaluate model performance on current and additional objectives.

        Args:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
            y (pd.Series): True labels of length [n_samples].
            objectives (list): Non-empty list of objectives to score on.
            X_train (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features].
            y_train (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train].

        Returns:
            dict: Ordered dictionary of objective scores.
        """
        X, y = self._convert_to_woodwork(X, y)
        X_train, y_train = self._convert_to_woodwork(X_train, y_train)
        objectives = self.create_objectives(objectives)
        y_predicted, y_predicted_proba = self._compute_predictions(
            X,
            y,
            X_train,
            y_train,
            objectives,
        )
        # The label predictions were encoded in _compute_predictions; encode the
        # true labels the same way so both sides are compared in the same space.
        if self._encoder is not None:
            y = self._encode_targets(y)
        return self._score_all_objectives(
            X,
            y,
            y_predicted,
            y_pred_proba=y_predicted_proba,
            objectives=objectives,
        )
class TimeSeriesBinaryClassificationPipeline(
    TimeSeriesClassificationPipeline,
    BinaryClassificationPipelineMixin,
):
    """Pipeline base class for time series binary classification problems.

    Args:
        component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
            [Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names
            ["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"]
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
            An empty dictionary {} implies using all default values for component parameters. Pipeline-level
            parameters such as date_index, gap, and max_delay must be specified with the "pipeline" key. For example:
            Pipeline(parameters={"pipeline": {"date_index": "Date", "max_delay": 4, "gap": 2}}).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Example:
        >>> pipeline = TimeSeriesBinaryClassificationPipeline(component_graph=["Simple Imputer", "Logistic Regression Classifier"],
        ...                                                   parameters={"Logistic Regression Classifier": {"penalty": "elasticnet",
        ...                                                                                                  "solver": "liblinear"},
        ...                                                               "pipeline": {"gap": 1, "max_delay": 1, "forecast_horizon": 1, "date_index": None}},
        ...                                                   custom_name="My TimeSeriesBinary Pipeline")
        ...
        >>> assert pipeline.custom_name == "My TimeSeriesBinary Pipeline"
        >>> assert pipeline.component_graph.component_dict.keys() == {'Simple Imputer', 'Logistic Regression Classifier'}
        ...
        >>> assert pipeline.parameters == {
        ...     'Simple Imputer': {'impute_strategy': 'most_frequent', 'fill_value': None},
        ...     'Logistic Regression Classifier': {'penalty': 'elasticnet',
        ...                                        'C': 1.0,
        ...                                        'n_jobs': -1,
        ...                                        'multi_class': 'auto',
        ...                                        'solver': 'liblinear'},
        ...     'pipeline': {'gap': 1, 'max_delay': 1, 'forecast_horizon': 1, 'date_index': None}}
    """

    problem_type = ProblemTypes.TIME_SERIES_BINARY

    def _select_y_pred_for_score(self, X, y, y_pred, y_pred_proba, objective):
        """Choose the predictions an objective should be scored against.

        Args:
            X: Holdout features, forwarded to ``_predict_with_objective``.
            y: Holdout target (not read by this method).
            y_pred: Label predictions already computed for the holdout data.
            y_pred_proba: Predicted probabilities for the holdout data.
            objective: Objective exposing ``score_needs_proba``.

        Returns:
            ``y_pred``, unless a threshold is set and the objective does not score on
            probabilities, in which case labels are recomputed from ``y_pred_proba``
            via ``_predict_with_objective``.
        """
        y_pred_to_use = y_pred
        if self.threshold is not None and not objective.score_needs_proba:
            y_pred_to_use = self._predict_with_objective(X, y_pred_proba, objective)
        return y_pred_to_use

    def predict_in_sample(self, X, y, X_train, y_train, objective=None):
        """Predict on future data where the target is known, e.g. cross validation.

        Args:
            X (pd.DataFrame): Future data of shape [n_samples, n_features].
            y (pd.Series): Future target of shape [n_samples].
            X_train (pd.DataFrame): Data the pipeline was trained on of shape [n_samples_train, n_features].
            y_train (pd.Series): Targets used to train the pipeline of shape [n_samples_train].
            objective (ObjectiveBase, str): Objective used to threshold predicted probabilities, optional. Defaults to None.

        Returns:
            pd.Series: Estimated labels.

        Raises:
            ValueError: If objective is not defined for time-series binary classification problems.
        """
        if objective is not None:
            objective = get_objective(objective, return_instance=True)
            if not objective.is_defined_for_problem_type(self.problem_type):
                raise ValueError(
                    f"Objective {objective.name} is not defined for time series binary classification."
                )

        if self.threshold is not None:
            proba = self.predict_proba_in_sample(X, y, X_train, y_train)
            # Keep only the probability column at position 1.
            proba = proba.iloc[:, 1]
            if objective is None:
                predictions = proba > self.threshold
            else:
                predictions = objective.decision_function(
                    proba, threshold=self.threshold, X=X
                )
            # NOTE(review): unlike the un-thresholded branch below, these
            # predictions are not passed through inverse_transform here --
            # confirm that returning the thresholded values as-is is intended.
            predictions = pd.Series(
                predictions,
                name=self.input_target_name,
                index=y.index,
            )
        else:
            # No threshold configured: defer to the parent implementation.
            predictions = super().predict_in_sample(X, y, X_train, y_train)
        return infer_feature_types(predictions)

    @staticmethod
    def _score(X, y, predictions, objective):
        """Given data, model predictions or predicted probabilities computed on the data, and an objective, evaluate and return the objective score.

        When ``predictions`` has more than one dimension, only the column at
        position 1 is scored.
        """
        if predictions.ndim > 1:
            predictions = predictions.iloc[:, 1]
        return TimeSeriesClassificationPipeline._score(X, y, predictions, objective)
class TimeSeriesMulticlassClassificationPipeline(TimeSeriesClassificationPipeline):
    """Pipeline base class for time series multiclass classification problems.

    Args:
        component_graph (list or dict): List of components in order. Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
            [Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names
            ["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"]
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
            An empty dictionary {} implies using all default values for component parameters. Pipeline-level
            parameters such as date_index, gap, and max_delay must be specified with the "pipeline" key. For example:
            Pipeline(parameters={"pipeline": {"date_index": "Date", "max_delay": 4, "gap": 2}}).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Example:
        >>> pipeline = TimeSeriesMulticlassClassificationPipeline(component_graph=["Simple Imputer", "Logistic Regression Classifier"],
        ...                                                       parameters={"Logistic Regression Classifier": {"penalty": "elasticnet",
        ...                                                                                                      "solver": "liblinear"},
        ...                                                                   "pipeline": {"gap": 1, "max_delay": 1, "forecast_horizon": 1, "date_index": None}},
        ...                                                       custom_name="My TimeSeriesMulticlass Pipeline")
        >>> assert pipeline.custom_name == "My TimeSeriesMulticlass Pipeline"
        >>> assert pipeline.component_graph.component_dict.keys() == {'Simple Imputer', 'Logistic Regression Classifier'}
        >>> assert pipeline.parameters == {
        ...     'Simple Imputer': {'impute_strategy': 'most_frequent', 'fill_value': None},
        ...     'Logistic Regression Classifier': {'penalty': 'elasticnet',
        ...                                        'C': 1.0,
        ...                                        'n_jobs': -1,
        ...                                        'multi_class': 'auto',
        ...                                        'solver': 'liblinear'},
        ...     'pipeline': {'gap': 1, 'max_delay': 1, 'forecast_horizon': 1, 'date_index': None}}
    """

    problem_type = ProblemTypes.TIME_SERIES_MULTICLASS
    """ProblemTypes.TIME_SERIES_MULTICLASS"""