Source code for evalml.pipelines.components.transformers.preprocessing.time_series_regularizer
"""Transformer that regularizes a dataset with an uninferrable offset frequency for time series problems."""importpandasaspdfromwoodwork.logical_typesimportDatetimefromwoodwork.statistics_utilsimportinfer_frequencyfromevalml.pipelines.components.transformers.transformerimportTransformerfromevalml.utilsimportinfer_feature_types
class TimeSeriesRegularizer(Transformer):
    """Transformer that regularizes an inconsistently spaced datetime column.

    If X is passed in to fit/transform, the column `time_index` will be checked for an inferrable offset frequency. If
    the `time_index` column is perfectly inferrable then this Transformer will do nothing and return the original X and y.

    If X does not have a perfectly inferrable frequency but one can be estimated, then X and y will be reformatted based
    on the estimated frequency for `time_index`. In the original X and y passed:
    - Missing datetime values will be added and will have their corresponding columns in X and y set to None.
    - Duplicate datetime values will be dropped.
    - Extra datetime values will be dropped.
    - If it can be determined that a duplicate or extra value is misaligned, then it will be repositioned to take the
      place of a missing value.

    This Transformer should be used before the `TimeSeriesImputer` in order to impute the missing values that were
    added to X and y (if passed).

    If used on multiseries dataset, works specifically on unstacked datasets.

    Args:
        time_index (string): Name of the column containing the datetime information used to order the data, required.
            Defaults to None.
        frequency_payload (tuple): Payload returned from Woodwork's infer_frequency function where debug is True.
            Defaults to None.
        window_length (int): The size of the rolling window over which inference is conducted
            to determine the prevalence of uninferrable frequencies. Lower values make this component more
            sensitive to recognizing numerous faulty datetime values. Defaults to 4.
        threshold (float): The minimum percentage of windows that need to have been able to infer
            a frequency. Lower values make this component more sensitive to recognizing
            numerous faulty datetime values. Defaults to 0.4.
        random_seed (int): Seed for the random number generator. This transformer performs
            the same regardless of the random seed provided. Defaults to 0.

    Raises:
        ValueError: if the frequency_payload parameter has not been passed a tuple
    """

    name = "Time Series Regularizer"
    hyperparameter_ranges = {}
    """{}"""
    # The target is rewritten alongside X (rows added/dropped), and regularization
    # only makes sense on training data — not at predict time.
    modifies_target = True
    training_only = True

    def __init__(
        self,
        time_index=None,
        frequency_payload=None,
        window_length=4,
        threshold=0.4,
        random_seed=0,
        **kwargs,
    ):
        self.time_index = time_index
        self.frequency_payload = frequency_payload
        self.window_length = window_length
        self.threshold = threshold
        # Populated during fit: error_dict maps problem kind -> {row index: datetime},
        # inferred_freq/debug_payload mirror Woodwork's infer_frequency(debug=True) output.
        self.error_dict = {}
        self.inferred_freq = None
        self.debug_payload = None

        if self.frequency_payload and not isinstance(self.frequency_payload, tuple):
            raise ValueError(
                "The frequency_payload parameter must be a tuple returned from Woodwork's infer_frequency function where debug is True.",
            )

        # NOTE: frequency_payload is deliberately absent from `parameters`; only the
        # tunable inference settings are recorded.
        parameters = {
            "time_index": time_index,
            "window_length": window_length,
            "threshold": threshold,
        }
        parameters.update(kwargs)
        super().__init__(parameters=parameters, random_seed=random_seed)

    def fit(self, X, y=None):
        """Fits the TimeSeriesRegularizer.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            ValueError: if self.time_index is None, if X and y have different lengths, if `time_index` in X does not
                have an offset frequency that can be estimated
            TypeError: if the `time_index` column is not of type Datetime
            KeyError: if the `time_index` column doesn't exist
        """
        if self.time_index is None:
            raise ValueError("The argument time_index cannot be None!")
        elif self.time_index not in X.columns:
            raise KeyError(
                f"The time_index column `{self.time_index}` does not exist in X!",
            )

        X_ww = infer_feature_types(X)

        if not isinstance(X_ww.ww.logical_types[self.time_index], Datetime):
            raise TypeError(
                f"The time_index column `{self.time_index}` must be of type Datetime.",
            )

        if y is not None:
            y = infer_feature_types(y)
            if len(X_ww) != len(y):
                raise ValueError(
                    "If y has been passed, then it must be the same length as X.",
                )

        # Use the caller-supplied payload when available; otherwise run Woodwork's
        # frequency inference in debug mode to get (inferred_freq, debug_info).
        if self.frequency_payload:
            ww_payload = self.frequency_payload
        else:
            ww_payload = infer_frequency(
                X_ww[self.time_index],
                debug=True,
                window_length=self.window_length,
                threshold=self.threshold,
            )
        self.inferred_freq = ww_payload[0]
        self.debug_payload = ww_payload[1]

        # A non-None inferred frequency means the data is already regular;
        # transform() will be a no-op in that case.
        if self.inferred_freq is not None:
            return self

        if (
            self.debug_payload["estimated_freq"] is None
        ):  # If even WW can't infer the frequency
            raise ValueError(
                f"The column {self.time_index} does not have a frequency that can be inferred.",
            )

        estimated_freq = self.debug_payload["estimated_freq"]
        duplicates = self.debug_payload["duplicate_values"]
        missing = self.debug_payload["missing_values"]
        extra = self.debug_payload["extra_values"]
        nan = self.debug_payload["nan_values"]

        self.error_dict = self._identify_indices(
            self.time_index,
            X_ww,
            estimated_freq,
            duplicates,
            missing,
            extra,
            nan,
        )

        return self

    @staticmethod
    def _identify_indices(
        time_index,
        X,
        estimated_freq,
        duplicates,
        missing,
        extra,
        nan,
    ):
        """Identifies which of the problematic indices is actually misaligned.

        Args:
            time_index (str): The column name of the datetime values to consider.
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            estimated_freq (str): The estimated frequency of the `time_index` column.
            duplicates (list): Payload information regarding the duplicate values.
            missing (list): Payload information regarding the missing values.
            extra (list): Payload information regarding the extra values.
            nan (list): Payload information regarding the nan values.

        Returns:
            (dict): A dictionary of the duplicate, missing, extra, and misaligned indices and their datetime values.
        """
        error_dict = {
            "duplicate": {},
            "missing": {},
            "extra": {},
            "nan": {},
            "misaligned": {},
        }

        # Adds the indices for the consecutive range of missing, duplicate, and extra values.
        # Each payload entry is assumed to carry "idx" (start index), "range" (run length)
        # and "dt" (start datetime) — matching Woodwork's debug payload format.
        for each_missing in missing:
            # Needed to recreate what the missing datetime values would have been
            temp_dates = pd.date_range(
                pd.to_datetime(each_missing["dt"]),
                freq=estimated_freq,
                periods=each_missing["range"],
            )
            for each_range in range(each_missing["range"]):
                error_dict["missing"][each_missing["idx"] + each_range] = temp_dates[
                    each_range
                ]

        for each_duplicate in duplicates:
            for each_range in range(each_duplicate["range"]):
                error_dict["duplicate"][each_duplicate["idx"] + each_range] = (
                    pd.to_datetime(each_duplicate["dt"])
                )

        for each_extra in extra:
            for each_range in range(each_extra["range"]):
                error_dict["extra"][each_extra["idx"] + each_range] = X.iloc[
                    each_extra["idx"] + each_range
                ][time_index]

        for each_nan in nan:
            for each_range in range(each_nan["range"]):
                error_dict["nan"][each_nan["idx"] + each_range] = "No Value"

        # Identify which of the duplicate/extra values in conjunction with the missing values are actually misaligned:
        # a duplicate/extra timestamp within one period of a missing slot is treated as that
        # slot's value, shifted out of place. Matched entries are nulled out here and
        # filtered below; `break` ensures each missing slot claims at most one candidate.
        for ind_missing, missing_value in error_dict["missing"].items():
            temp_range = pd.date_range(missing_value, freq=estimated_freq, periods=3)
            window_range = temp_range[1] - temp_range[0]
            missing_range = [missing_value - window_range, missing_value + window_range]
            for ind_duplicate, duplicate_value in error_dict["duplicate"].items():
                if (
                    duplicate_value is not None
                    and missing_range[0] <= duplicate_value <= missing_range[1]
                ):
                    error_dict["misaligned"][ind_duplicate] = {
                        "incorrect": duplicate_value,
                        "correct": missing_value,
                    }
                    error_dict["duplicate"][ind_duplicate] = None
                    error_dict["missing"][ind_missing] = None
                    break
            for ind_extra, extra_value in error_dict["extra"].items():
                if (
                    extra_value is not None
                    and missing_range[0] <= extra_value <= missing_range[1]
                ):
                    error_dict["misaligned"][ind_extra] = {
                        "incorrect": extra_value,
                        "correct": missing_value,
                    }
                    error_dict["extra"][ind_extra] = None
                    error_dict["missing"][ind_missing] = None
                    break

        final_error_dict = {
            "duplicate": {},
            "missing": {},
            "extra": {},
            "nan": {},
            "misaligned": {},
        }
        # Remove duplicate/extra/missing values that were identified as misaligned
        for type_, type_inds in error_dict.items():
            new_type_inds = {
                ind_: date_ for ind_, date_ in type_inds.items() if date_ is not None
            }
            final_error_dict[type_] = new_type_inds

        return final_error_dict

    def transform(self, X, y=None):
        """Regularizes a dataframe and target data to an inferrable offset frequency.

        A 'clean' X and y (if y was passed in) are created based on an inferrable offset frequency and matching
        datetime values with the original X and y are imputed into the clean X and y. Datetime values identified
        as misaligned are shifted into their appropriate position.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            (pd.DataFrame, pd.Series): Data with an inferrable `time_index` offset frequency.
        """
        # fit() found a perfectly inferrable frequency — nothing to regularize.
        if self.inferred_freq is not None:
            return X, y

        # The cleaned df will begin at the range determined by estimated_range_start, which will result
        # in dropping of the first consecutive faulty values in the dataset.
        cleaned_df = pd.DataFrame(
            {
                self.time_index: pd.date_range(
                    self.debug_payload["estimated_range_start"],
                    self.debug_payload["estimated_range_end"],
                    freq=self.debug_payload["estimated_freq"],
                ),
            },
        )

        # Left-merge onto the regular grid: missing timestamps become NaN rows;
        # groupby(...).first() collapses duplicate timestamps to their first occurrence.
        cleaned_x = cleaned_df.merge(X, on=[self.time_index], how="left")
        cleaned_x = cleaned_x.groupby(self.time_index).first().reset_index()

        cleaned_y = None
        if y is not None:
            if isinstance(y, pd.Series):
                y_dates = pd.DataFrame(
                    {self.time_index: X[self.time_index], "target": y},
                )
            else:
                # Unstacked multiseries target: attach the time index column directly.
                # NOTE(review): this mutates the caller's y DataFrame in place — confirm intended.
                y_dates = y
                y_dates[self.time_index] = X[self.time_index]
            cleaned_y = cleaned_df.merge(y_dates, on=[self.time_index], how="left")
            cleaned_y = cleaned_y.groupby(self.time_index).first().reset_index()

        # Move each misaligned row to its "correct" timestamp slot in the cleaned data.
        for index, values in self.error_dict["misaligned"].items():
            # NOTE(review): mutating the row Series returned by X.iloc[index] may raise
            # SettingWithCopyWarning and could write through to X — confirm acceptable.
            to_replace = X.iloc[index]
            to_replace[self.time_index] = values["correct"]
            cleaned_x.loc[cleaned_x[self.time_index] == values["correct"]] = (
                to_replace.values
            )
            if y is not None and isinstance(y, pd.Series):
                cleaned_y.loc[cleaned_y[self.time_index] == values["correct"]] = y.iloc[
                    index
                ]

        if cleaned_y is not None:
            if isinstance(y, pd.Series):
                cleaned_y = cleaned_y["target"]
            elif isinstance(y, pd.DataFrame):
                # remove date time column from unstacked y
                cleaned_y = cleaned_y.drop(columns=self.time_index, axis=1)
            cleaned_y.ww.init()

        cleaned_x.ww.init()

        return cleaned_x, cleaned_y