Source code for evalml.pipelines.components.transformers.imputers.time_series_imputer
"""Component that imputes missing data according to a specified timeseries-specific imputation strategy."""importpandasaspdimportwoodworkaswwfromwoodwork.logical_typesimport(BooleanNullable,Double,)fromevalml.pipelines.components.transformersimportTransformerfromevalml.utilsimportinfer_feature_typesfromevalml.utils.nullable_type_utilsimport(_determine_fractional_type,_determine_non_nullable_equivalent,)
class TimeSeriesImputer(Transformer):
    """Imputes missing data according to a specified timeseries-specific imputation strategy.

    This Transformer should be used after the `TimeSeriesRegularizer` in order to impute the missing
    values that were added to X and y (if passed).

    Args:
        categorical_impute_strategy (string): Impute strategy to use for string, object, boolean, categorical dtypes.
            Valid values include "backwards_fill" and "forwards_fill". Defaults to "forwards_fill".
        numeric_impute_strategy (string): Impute strategy to use for numeric columns. Valid values include
            "backwards_fill", "forwards_fill", and "interpolate". Defaults to "interpolate".
        target_impute_strategy (string): Impute strategy to use for the target column. Valid values include
            "backwards_fill", "forwards_fill", and "interpolate". Defaults to "forwards_fill".
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Raises:
        ValueError: If categorical_impute_strategy, numeric_impute_strategy, or target_impute_strategy is
            not one of the valid values.
    """

    modifies_features = True
    modifies_target = True
    training_only = True

    name = "Time Series Imputer"
    hyperparameter_ranges = {
        "categorical_impute_strategy": ["backwards_fill", "forwards_fill"],
        "numeric_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"],
        "target_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"],
    }
    """{
        "categorical_impute_strategy": ["backwards_fill", "forwards_fill"],
        "numeric_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"],
        "target_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"],
    }"""

    _valid_categorical_impute_strategies = set(["backwards_fill", "forwards_fill"])
    _valid_numeric_impute_strategies = set(
        ["backwards_fill", "forwards_fill", "interpolate"],
    )
    _valid_target_impute_strategies = set(
        ["backwards_fill", "forwards_fill", "interpolate"],
    )

    # Incompatibility: https://github.com/alteryx/evalml/issues/4001
    # TODO: Remove when support is added https://github.com/alteryx/evalml/issues/4014
    _integer_nullable_incompatibilities = ["X", "y"]
    _boolean_nullable_incompatibilities = ["y"]

    def __init__(
        self,
        categorical_impute_strategy="forwards_fill",
        numeric_impute_strategy="interpolate",
        target_impute_strategy="forwards_fill",
        random_seed=0,
        **kwargs,
    ):
        # Validate each strategy against its own valid set before doing any work.
        # BUGFIX: the categorical error message previously joined
        # _valid_numeric_impute_strategies, wrongly advertising "interpolate" as
        # a valid categorical strategy; it now lists the categorical set.
        if categorical_impute_strategy not in self._valid_categorical_impute_strategies:
            raise ValueError(
                f"{categorical_impute_strategy} is an invalid parameter. Valid categorical impute strategies are {', '.join(self._valid_categorical_impute_strategies)}",
            )
        elif numeric_impute_strategy not in self._valid_numeric_impute_strategies:
            raise ValueError(
                f"{numeric_impute_strategy} is an invalid parameter. Valid numeric impute strategies are {', '.join(self._valid_numeric_impute_strategies)}",
            )
        elif target_impute_strategy not in self._valid_target_impute_strategies:
            raise ValueError(
                f"{target_impute_strategy} is an invalid parameter. Valid target column impute strategies are {', '.join(self._valid_target_impute_strategies)}",
            )

        parameters = {
            "categorical_impute_strategy": categorical_impute_strategy,
            "numeric_impute_strategy": numeric_impute_strategy,
            "target_impute_strategy": target_impute_strategy,
        }
        parameters.update(kwargs)

        # Column bookkeeping populated by fit():
        self._all_null_cols = None  # feature columns that are entirely null
        self._forwards_cols = None  # columns to impute with forward fill
        self._backwards_cols = None  # columns to impute with backward fill
        self._interpolate_cols = None  # columns to impute via interpolation
        self._impute_target = None  # target strategy; set only if y has nulls
        self._y_all_null_cols = None  # all-null target columns (DataFrame y)
        super().__init__(
            parameters=parameters,
            component_obj=None,
            random_seed=random_seed,
        )
[docs]deffit(self,X,y=None):"""Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same. If a value is missing at the beginning or end of a column, that value will be imputed using backwards fill or forwards fill as necessary, respectively. Args: X (pd.DataFrame, np.ndarray): The input training data of shape [n_samples, n_features] y (pd.Series, optional): The target training data of length [n_samples] Returns: self """X=infer_feature_types(X)nan_ratio=X.isna().sum()/X.shape[0]self._all_null_cols=nan_ratio[nan_ratio==1].index.tolist()def_filter_cols(impute_strat,X):"""Function to return which columns of the dataset to impute given the impute strategy."""cols=[]ifself.parameters["categorical_impute_strategy"]==impute_strat:ifself.parameters["numeric_impute_strategy"]==impute_strat:cols=list(X.columns)else:cols=list(X.ww.select(exclude=["numeric"]).columns)elifself.parameters["numeric_impute_strategy"]==impute_strat:cols=list(X.ww.select(include=["numeric"]).columns)X_cols=[colforcolincolsifcolnotinself._all_null_cols]iflen(X_cols)>0:returnX_colsself._forwards_cols=_filter_cols("forwards_fill",X)self._backwards_cols=_filter_cols("backwards_fill",X)self._interpolate_cols=_filter_cols("interpolate",X)ifisinstance(y,pd.Series):y=infer_feature_types(y)ify.isnull().any():self._impute_target=self.parameters["target_impute_strategy"]elifisinstance(y,pd.DataFrame):y=infer_feature_types(y)y_nan_ratio=y.isna().sum()/y.shape[0]self._y_all_null_cols=y_nan_ratio[y_nan_ratio==1].index.tolist()ify.isnull().values.any():self._impute_target=self.parameters["target_impute_strategy"]returnself
    def transform(self, X, y=None):
        """Transforms data X by imputing missing values using specified timeseries-specific strategies. 'None' values are converted to np.nan before imputation and are treated as the same.

        Args:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Optionally, target data to transform.

        Returns:
            pd.DataFrame: Transformed X and y
        """
        # If every feature column was all-null at fit time, there is nothing to
        # impute: return an empty (index-only) frame and the target untouched.
        if len(self._all_null_cols) == X.shape[1]:
            df = pd.DataFrame(index=X.index)
            df.ww.init()
            return df, y
        X = infer_feature_types(X)
        if y is not None:
            y = infer_feature_types(y)

        # This will change the logical type of BooleanNullable/IntegerNullable/AgeNullable columns with nans
        # so we save the original schema to recreate it where possible after imputation
        original_schema = X.ww.schema
        X, y = self._handle_nullable_types(X, y)

        X_not_all_null = X.ww.drop(self._all_null_cols)

        # Because the TimeSeriesImputer is always used with the TimeSeriesRegularizer,
        # many of the columns containing nans may have originally been non nullable logical types.
        # We will use the non nullable equivalents where possible
        original_schema = original_schema.get_subset_schema(
            list(X_not_all_null.columns),
        )
        new_ltypes = {
            col: _determine_non_nullable_equivalent(ltype)
            for col, ltype in original_schema.logical_types.items()
        }

        # NOTE(review): DataFrame.pad() is a deprecated alias of ffill() in
        # pandas 2.x — consider migrating pad()/pad(inplace=True) to ffill().
        if self._forwards_cols is not None:
            X_forward = X[self._forwards_cols]
            imputed = X_forward.pad()
            imputed.bfill(inplace=True)  # Fill in the first value, if missing
            X_not_all_null[X_forward.columns] = imputed

        if self._backwards_cols is not None:
            X_backward = X[self._backwards_cols]
            imputed = X_backward.bfill()
            imputed.pad(inplace=True)  # Fill in the last value, if missing
            X_not_all_null[X_backward.columns] = imputed

        if self._interpolate_cols is not None:
            X_interpolate = X_not_all_null[self._interpolate_cols]
            imputed = X_interpolate.interpolate()
            imputed.bfill(inplace=True)  # Fill in the first value, if missing
            X_not_all_null[X_interpolate.columns] = imputed

            # Interpolate may add floating point values to integer data, so we
            # have to update those logical types from the ones passed in to a fractional type
            # Note we ignore all other types of columns to maintain the types specified above
            int_cols_to_update = original_schema._filter_cols(
                include=["IntegerNullable", "AgeNullable"],
            )
            new_int_ltypes = {
                col: _determine_fractional_type(ltype)
                for col, ltype in original_schema.logical_types.items()
                if col in int_cols_to_update
            }
            new_ltypes.update(new_int_ltypes)
        X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes)

        # For a DataFrame target (multiseries), drop its all-null columns; a
        # Series target is copied so the original y is never mutated.
        y_imputed = (
            y.ww.drop(self._y_all_null_cols)
            if isinstance(y, pd.DataFrame)
            else pd.Series(y)
        )
        if y is not None and len(y) > 0:
            # self._impute_target is None when fit() saw no nulls in y, in
            # which case none of these branches run.
            if self._impute_target == "forwards_fill":
                y_imputed = y_imputed.pad()
                y_imputed.bfill(inplace=True)
            elif self._impute_target == "backwards_fill":
                y_imputed = y_imputed.bfill()
                y_imputed.pad(inplace=True)
            elif self._impute_target == "interpolate":
                y_imputed = y_imputed.interpolate()
                y_imputed.bfill(inplace=True)
            # Re-initialize woodwork with the downcast logical type
            if isinstance(y, pd.Series):
                y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
            else:
                y_original_schema = y.ww.schema.get_subset_schema(
                    list(y_imputed.columns),
                )
                y_new_ltypes = {
                    col: _determine_non_nullable_equivalent(ltype)
                    for col, ltype in y_original_schema.logical_types.items()
                }
                y_imputed.ww.init(schema=y_original_schema, logical_types=y_new_ltypes)

        return X_not_all_null, y_imputed
def_handle_nullable_types(self,X=None,y=None):"""Transforms X and y to remove any incompatible nullable types for the time series imputer when the interpolate method is used. Args: X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. May contain nullable types. y (pd.Series or pd.DataFrame, optional): The target of length [n_samples] or the unstacked target for a multiseries problem of length [n_samples, n_features*n_series]. May contain nullable types. Returns: X, y with any incompatible nullable types downcasted to compatible equivalents when interpolate is used. Is NoOp otherwise. """ifself._impute_target=="interpolate":# For BooleanNullable, we have to avoid Categorical columns# since the category dtype also has incompatibilities with linear interpolate, which is expected# TODO: Avoid categorical columns for BooleanNullable in multiseries when# multiseries timeseries supports categoricalifisinstance(y,pd.Series)andisinstance(y.ww.logical_type,BooleanNullable,):y=ww.init_series(y,Double)else:_,y=super()._handle_nullable_types(None,y)ifself._interpolate_colsisnotNone:X,_=super()._handle_nullable_types(X,None)returnX,y