Source code for evalml.pipelines.components.transformers.imputers.time_series_imputer
"""Component that imputes missing data according to a specified timeseries-specific imputation strategy."""importpandasaspdimportwoodworkaswwfromwoodwork.logical_typesimportBooleanNullable,Doublefromevalml.pipelines.components.transformersimportTransformerfromevalml.utilsimportinfer_feature_types
class TimeSeriesImputer(Transformer):
    """Imputes missing data according to a specified timeseries-specific imputation strategy.

    This Transformer should be used after the `TimeSeriesRegularizer` in order to impute the missing values that were
    added to X and y (if passed).

    Args:
        categorical_impute_strategy (string): Impute strategy to use for string, object, boolean, categorical dtypes.
            Valid values include "backwards_fill" and "forwards_fill". Defaults to "forwards_fill".
        numeric_impute_strategy (string): Impute strategy to use for numeric columns. Valid values include
            "backwards_fill", "forwards_fill", and "interpolate". Defaults to "interpolate".
        target_impute_strategy (string): Impute strategy to use for the target column. Valid values include
            "backwards_fill", "forwards_fill", and "interpolate". Defaults to "forwards_fill".
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Raises:
        ValueError: If categorical_impute_strategy, numeric_impute_strategy, or target_impute_strategy is not one of the valid values.
    """

    modifies_features = True
    modifies_target = True
    training_only = True

    name = "Time Series Imputer"
    hyperparameter_ranges = {
        "categorical_impute_strategy": ["backwards_fill", "forwards_fill"],
        "numeric_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"],
        "target_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"],
    }
    """{
        "categorical_impute_strategy": ["backwards_fill", "forwards_fill"],
        "numeric_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"],
        "target_impute_strategy": ["backwards_fill", "forwards_fill", "interpolate"],
    }"""

    _valid_categorical_impute_strategies = {"backwards_fill", "forwards_fill"}
    _valid_numeric_impute_strategies = {
        "backwards_fill",
        "forwards_fill",
        "interpolate",
    }
    _valid_target_impute_strategies = {
        "backwards_fill",
        "forwards_fill",
        "interpolate",
    }

    # Incompatibility: https://github.com/alteryx/evalml/issues/4001
    # TODO: Remove when support is added https://github.com/alteryx/evalml/issues/4014
    _integer_nullable_incompatibilities = ["X", "y"]
    _boolean_nullable_incompatibilities = ["X", "y"]

    def __init__(
        self,
        categorical_impute_strategy="forwards_fill",
        numeric_impute_strategy="interpolate",
        target_impute_strategy="forwards_fill",
        random_seed=0,
        **kwargs,
    ):
        # The valid options are sorted before joining so the error text is
        # deterministic (set iteration order for strings varies between runs).
        if categorical_impute_strategy not in self._valid_categorical_impute_strategies:
            # Report the *categorical* options here; "interpolate" is not
            # a valid categorical strategy.
            raise ValueError(
                f"{categorical_impute_strategy} is an invalid parameter. Valid categorical impute strategies are {', '.join(sorted(self._valid_categorical_impute_strategies))}",
            )
        elif numeric_impute_strategy not in self._valid_numeric_impute_strategies:
            raise ValueError(
                f"{numeric_impute_strategy} is an invalid parameter. Valid numeric impute strategies are {', '.join(sorted(self._valid_numeric_impute_strategies))}",
            )
        elif target_impute_strategy not in self._valid_target_impute_strategies:
            raise ValueError(
                f"{target_impute_strategy} is an invalid parameter. Valid target column impute strategies are {', '.join(sorted(self._valid_target_impute_strategies))}",
            )

        parameters = {
            "categorical_impute_strategy": categorical_impute_strategy,
            "numeric_impute_strategy": numeric_impute_strategy,
            "target_impute_strategy": target_impute_strategy,
        }
        parameters.update(kwargs)

        # State populated by fit(); None means "nothing to do" for that strategy.
        self._all_null_cols = None
        self._forwards_cols = None
        self._backwards_cols = None
        self._interpolate_cols = None
        self._impute_target = None
        super().__init__(
            parameters=parameters,
            component_obj=None,
            random_seed=random_seed,
        )

    def fit(self, X, y=None):
        """Fits imputer to data.

        'None' values are converted to np.nan before imputation and are treated as the same.
        If a value is missing at the beginning or end of a column, that value will be imputed using
        backwards fill or forwards fill as necessary, respectively.

        Args:
            X (pd.DataFrame, np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series, optional): The target training data of length [n_samples]

        Returns:
            self
        """
        X = infer_feature_types(X)

        # Columns that are entirely null cannot be imputed; remember them so
        # transform() can drop them.
        nan_ratio = X.ww.describe().loc["nan_count"] / X.shape[0]
        self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()

        def _filter_cols(impute_strat, X):
            """Function to return which columns of the dataset to impute given the impute strategy.

            Returns None (rather than an empty list) when no column uses the strategy.
            """
            cols = []
            if self.parameters["categorical_impute_strategy"] == impute_strat:
                if self.parameters["numeric_impute_strategy"] == impute_strat:
                    # Both column families share this strategy.
                    cols = list(X.columns)
                else:
                    cols = list(X.ww.select(exclude=["numeric"]).columns)
            elif self.parameters["numeric_impute_strategy"] == impute_strat:
                cols = list(X.ww.select(include=["numeric"]).columns)

            X_cols = [col for col in cols if col not in self._all_null_cols]
            if len(X_cols) > 0:
                return X_cols
            return None

        self._forwards_cols = _filter_cols("forwards_fill", X)
        self._backwards_cols = _filter_cols("backwards_fill", X)
        self._interpolate_cols = _filter_cols("interpolate", X)

        # The target is only imputed at transform time if it had missing
        # values when fitted.
        if y is not None:
            y = infer_feature_types(y)
            if y.isnull().any():
                self._impute_target = self.parameters["target_impute_strategy"]

        return self

    def transform(self, X, y=None):
        """Transforms data X by imputing missing values using specified timeseries-specific strategies.

        'None' values are converted to np.nan before imputation and are treated as the same.

        Args:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Optionally, target data to transform.

        Returns:
            tuple: The transformed X (with the columns that were entirely null at fit time dropped)
            and the transformed y. If every column was entirely null at fit time, an empty-column
            DataFrame sharing X's index is returned together with y unchanged.
        """
        if len(self._all_null_cols) == X.shape[1]:
            df = pd.DataFrame(index=X.index)
            df.ww.init()
            return df, y
        X = infer_feature_types(X)
        if y is not None:
            y = infer_feature_types(y)

        X_not_all_null = X.ww.drop(self._all_null_cols)
        # Keep the existing schema only for columns that are not nullable
        # integer/boolean/age types; the remaining columns are left out of the
        # schema passed to ww.init below (presumably so their types are
        # re-inferred after imputation).
        X_schema = X_not_all_null.ww.schema
        X_schema = X_schema.get_subset_schema(
            subset_cols=X_schema._filter_cols(
                exclude=["IntegerNullable", "BooleanNullable", "AgeNullable"],
            ),
        )

        if self._forwards_cols is not None:
            X_forward = X.ww[self._forwards_cols]
            # ffill replaces the deprecated pad alias.
            imputed = X_forward.ffill()
            imputed.bfill(inplace=True)  # Fill in the first value, if missing
            X_not_all_null[X_forward.columns] = imputed

        if self._backwards_cols is not None:
            X_backward = X.ww[self._backwards_cols]
            imputed = X_backward.bfill()
            imputed.ffill(inplace=True)  # Fill in the last value, if missing
            X_not_all_null[X_backward.columns] = imputed

        if self._interpolate_cols is not None:
            X_interpolate = X.ww[self._interpolate_cols]
            # TODO: Revert when pandas introduces Float64 dtype
            # Cast to float because Int64 not handled
            imputed = X_interpolate.astype(float).interpolate()
            imputed.bfill(inplace=True)  # Fill in the first value, if missing
            X_not_all_null[X_interpolate.columns] = imputed
        X_not_all_null.ww.init(schema=X_schema)

        y_imputed = pd.Series(y)
        if y is not None and len(y) > 0:
            if self._impute_target == "forwards_fill":
                y_imputed = y.ffill()
                y_imputed.bfill(inplace=True)  # Fill in the first value, if missing
            elif self._impute_target == "backwards_fill":
                y_imputed = y.bfill()
                y_imputed.ffill(inplace=True)  # Fill in the last value, if missing
            elif self._impute_target == "interpolate":
                # TODO: Revert when pandas introduces Float64 dtype
                y_imputed = y.astype(float).interpolate()
                y_imputed.bfill(inplace=True)  # Fill in the first value, if missing
            y_imputed = ww.init_series(y_imputed)

        return X_not_all_null, y_imputed

    def _handle_nullable_types(self, X=None, y=None):
        """Transforms X and y to remove any incompatible nullable types for the time series imputer when the interpolate method is used.

        Args:
            X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features].
                May contain nullable types.
            y (pd.Series, optional): The target of length [n_samples].
                May contain nullable types.

        Returns:
            X, y with any incompatible nullable types downcasted to compatible equivalents when interpolate is used. Is NoOp otherwise.
        """
        # y defaults to None, so guard before touching its woodwork accessor.
        if y is not None and self._impute_target == "interpolate":
            # For BooleanNullable, we have to avoid Categorical columns
            # since the category dtype also has incompatibilities with linear interpolate, which is expected
            if isinstance(y.ww.logical_type, BooleanNullable):
                y = ww.init_series(y, Double)
            else:
                _, y = super()._handle_nullable_types(None, y)
        if self._interpolate_cols is not None:
            X, _ = super()._handle_nullable_types(X, None)

        return X, y