Source code for evalml.pipelines.components.transformers.preprocessing.delayed_feature_transformer

"""Transformer that delays input features and target variable for time series problems."""
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from woodwork import logical_types

from evalml.pipelines.components.transformers.transformer import Transformer
from evalml.utils import infer_feature_types


class DelayedFeatureTransformer(Transformer):
    """Transformer that delays input features and target variable for time series problems.

    Args:
        date_index (str): Name of the column containing the datetime information used to order the data. Ignored.
        max_delay (int): Maximum number of time units to delay each feature. Defaults to 2.
        forecast_horizon (int): The number of time periods the pipeline is expected to forecast. Defaults to 1.
        delay_features (bool): Whether to delay the input features. Defaults to True.
        delay_target (bool): Whether to delay the target. Defaults to True.
        gap (int): The number of time units between when the features are collected and
            when the target is collected. It is added to forecast_horizon to obtain the
            smallest delay that is applied (``start_delay = forecast_horizon + gap``).
            Defaults to 0.
        random_seed (int): Seed for the random number generator. This transformer performs the same regardless of the random seed provided.
        **kwargs: Extra entries merged into the parameters dict passed to the base class.
    """

    name = "Delayed Feature Transformer"
    hyperparameter_ranges = {}
    """{}"""
    needs_fitting = False
    target_colname_prefix = "target_delay_{}"
    """target_delay_{}"""

    def __init__(
        self,
        date_index=None,
        max_delay=2,
        gap=0,
        forecast_horizon=1,
        delay_features=True,
        delay_target=True,
        random_seed=0,
        **kwargs,
    ):
        self.date_index = date_index
        self.max_delay = max_delay
        self.delay_features = delay_features
        self.delay_target = delay_target
        self.forecast_horizon = forecast_horizon
        self.gap = gap

        # Smallest shift applied to any column; every delay is counted from here.
        self.start_delay = self.forecast_horizon + self.gap

        parameters = {
            "date_index": date_index,
            "max_delay": max_delay,
            "delay_target": delay_target,
            "delay_features": delay_features,
            "forecast_horizon": forecast_horizon,
            "gap": gap,
        }
        parameters.update(kwargs)
        super().__init__(parameters=parameters, random_seed=random_seed)

    def fit(self, X, y=None):
        """Fits the DelayedFeatureTransformer.

        This is a no-op: the method returns immediately without inspecting X or y.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series, optional): The target training data of length [n_samples]

        Returns:
            self
        """
        return self

    @staticmethod
    def _encode_y_while_preserving_index(y):
        """Label-encode the target and return it as a Series carrying y's original index."""
        y_encoded = LabelEncoder().fit_transform(y)
        y = pd.Series(y_encoded, index=y.index)
        return y

    @staticmethod
    def _get_categorical_columns(X):
        """Return the names of the columns woodwork selects as "categorical"."""
        return list(X.ww.select(["categorical"], return_schema=True).columns)

    @staticmethod
    def _encode_X_while_preserving_index(X_categorical):
        """Ordinal-encode the given columns, keeping their column names and index."""
        return pd.DataFrame(
            OrdinalEncoder().fit_transform(X_categorical),
            columns=X_categorical.columns,
            index=X_categorical.index,
        )

    def transform(self, X, y=None):
        """Computes the delayed features for all features in X and y.

        For each feature in X, it will add a column to the output dataframe for each
        delay in the (inclusive) range [start_delay, start_delay + max_delay], where
        start_delay = forecast_horizon + gap. The values of each delayed feature are
        simply the original feature shifted forward in time by the delay amount. For
        example, a delay of 3 units means that the feature value at row n will be taken
        from the n-3rd row of that feature.

        Categorical features are ordinal-encoded before they are shifted. Features are
        only delayed when delay_features is True and X is not empty.

        If y is not None and delay_target is True, it will also compute the delayed
        values for the target variable over the same range of delays; a categorical
        target is label-encoded first.

        Args:
            X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
            y (pd.Series, or None): Target.

        Returns:
            pd.DataFrame: Transformed X. The original feature columns are dropped, so
                only the delayed columns remain.
        """
        if X is None:
            X = pd.DataFrame()

        # Normalize the data into pandas objects
        X_ww = infer_feature_types(X)
        X_ww = X_ww.ww.copy()

        # Snapshot the input columns before any delayed column is added: this list
        # drives the loop below and is what gets dropped from the result.
        original_features = list(X_ww.columns)
        delays = range(self.start_delay, self.start_delay + self.max_delay + 1)

        if self.delay_features and len(X) > 0:
            categorical_columns = self._get_categorical_columns(X_ww)
            # Only fit the encoder when there is something to encode.
            X_categorical = None
            if categorical_columns:
                X_categorical = self._encode_X_while_preserving_index(
                    X_ww[categorical_columns]
                )
            # Iterate over the snapshot rather than X_ww itself, because X_ww
            # gains columns inside the loop.
            for col_name in original_features:
                if col_name in categorical_columns:
                    col = X_categorical[col_name]
                else:
                    col = X_ww[col_name]
                for t in delays:
                    X_ww.ww[f"{col_name}_delay_{t}"] = col.shift(t)

        # Handle cases where the target was passed in
        if self.delay_target and y is not None:
            y = infer_feature_types(y)
            if isinstance(y.ww.logical_type, logical_types.Categorical):
                y = self._encode_y_while_preserving_index(y)
            for t in delays:
                X_ww.ww[self.target_colname_prefix.format(t)] = y.shift(t)

        return X_ww.ww.drop(original_features)

    def fit_transform(self, X, y=None):
        """Fit the component and transform the input data.

        Args:
            X (pd.DataFrame or None): Data to transform. None is expected when only the target variable is being used.
            y (pd.Series, or None): Target. Defaults to None.

        Returns:
            pd.DataFrame: Transformed X.
        """
        return self.fit(X, y).transform(X, y)