Source code for evalml.pipelines.components.transformers.preprocessing.datetime_featurizer
"""Transformer that can automatically extract features from datetime columns."""
import numpy as np
import pandas as pd
import woodwork as ww
from featuretools.primitives import Hour, Month, Weekday, Year

from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types


def _extract_year(col, encode_as_categories=False):
    """Apply the featuretools ``Year`` primitive to a datetime column.

    Args:
        col (pd.Series): Datetime column to featurize.
        encode_as_categories (bool): Accepted so every extractor shares one
            signature; not used here.

    Returns:
        tuple: ``(years, None)``. No category mapping is produced for years.
    """
    year_primitive = Year()
    return year_primitive(col), None


# Zero-based month number -> month name (0 is January, 11 is December).
_int_to_month_mapping = dict(
    enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
    ),
)


def _extract_month(col, encode_as_categories=False):
    """Apply the featuretools ``Month`` primitive and shift the result down by one.

    Args:
        col (pd.Series): Datetime column to featurize.
        encode_as_categories (bool): If True, the returned series is
            re-initialized with woodwork's "Categorical" logical type.

    Returns:
        tuple: ``(months, mapping)`` where ``mapping`` goes from month name to
        the shifted month number, for the values present in ``col`` only.
        Values with no entry in ``_int_to_month_mapping`` are keyed by NaN.
    """
    shifted = Month()(col) - 1
    # The distinct values are captured before any categorical conversion.
    distinct_values = pd.Series(shifted.unique())
    name_to_number = {}
    for number in distinct_values:
        name_to_number[_int_to_month_mapping.get(number, np.nan)] = number
    if encode_as_categories:
        shifted = ww.init_series(shifted, logical_type="Categorical")
    return shifted, name_to_number


# Day number -> day name, with the week starting on Sunday (0).
_int_to_day_mapping = dict(
    enumerate(
        (
            "Sunday",
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
        ),
    ),
)

# Inverse lookup: day name -> day number.
_day_to_int_mapping = {
    day_name: day_number for day_number, day_name in _int_to_day_mapping.items()
}


def _extract_day_of_week(col, encode_as_categories=False):
    """Apply the featuretools ``Weekday`` primitive and renumber to Sunday = 0.

    The primitive's output is shifted up by one and 7 is wrapped to 0, so the
    values line up with ``_int_to_day_mapping``.
    NOTE(review): this presumes ``Weekday`` yields 0 for Monday through 6 for
    Sunday -- confirm against the featuretools documentation.

    Args:
        col (pd.Series): Datetime column to featurize.
        encode_as_categories (bool): If True, the returned series is
            re-initialized with woodwork's "Categorical" logical type.

    Returns:
        tuple: ``(days, mapping)`` where ``mapping`` goes from day name to day
        number, for the values present in ``col`` only. Values with no entry
        in ``_int_to_day_mapping`` are keyed by NaN.
    """
    renumbered = (Weekday()(col) + 1).replace(7, 0)
    # The distinct values are captured before any categorical conversion.
    distinct_values = renumbered.unique()
    name_to_number = {}
    for number in distinct_values:
        name_to_number[_int_to_day_mapping.get(number, np.nan)] = number
    if encode_as_categories:
        renumbered = ww.init_series(renumbered, logical_type="Categorical")
    return renumbered, name_to_number


def _extract_hour(col, encode_as_categories=False):
    """Apply the featuretools ``Hour`` primitive to a datetime column.

    Args:
        col (pd.Series): Datetime column to featurize.
        encode_as_categories (bool): Accepted so every extractor shares one
            signature; not used here.

    Returns:
        tuple: ``(hours, None)``. No category mapping is produced for hours.
    """
    hour_primitive = Hour()
    return hour_primitive(col), None
class DateTimeFeaturizer(Transformer):
    """Transformer that can automatically extract features from datetime columns.

    Args:
        features_to_extract (list): Names of the features to derive from each
            datetime column. Valid options are "year", "month", "day_of_week"
            and "hour". Defaults to None, which selects all four.
        encode_as_categories (bool): Whether the day-of-week and month features
            should be encoded as categorical, which allows OneHotEncoders to
            encode these features. Defaults to False.
        time_index (str): Name of the column containing the datetime
            information used to order the data. Ignored.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "DateTime Featurizer"
    hyperparameter_ranges = {}
    """{}"""
    # Feature name -> module-level extractor; each returns (values, mapping or None).
    _function_mappings = {
        "year": _extract_year,
        "month": _extract_month,
        "day_of_week": _extract_day_of_week,
        "hour": _extract_hour,
    }

    def __init__(
        self,
        features_to_extract=None,
        encode_as_categories=False,
        time_index=None,
        random_seed=0,
        **kwargs,
    ):
        if features_to_extract is None:
            features_to_extract = ["year", "month", "day_of_week", "hour"]

        # Reject any requested feature that has no registered extractor.
        unknown = set(features_to_extract).difference(self._function_mappings)
        if len(unknown) > 0:
            quoted = ", ".join([f"'{feature}'" for feature in unknown])
            raise ValueError(
                f"{quoted} are not valid options for features_to_extract",
            )

        # Extra keyword arguments are recorded as parameters too and override
        # the three named entries on a key clash.
        parameters = {
            "features_to_extract": features_to_extract,
            "encode_as_categories": encode_as_categories,
            "time_index": time_index,
            **kwargs,
        }

        self._date_time_col_names = None  # set by fit()
        self._categories = {}  # filled in by transform()
        self.encode_as_categories = encode_as_categories
        super().__init__(
            parameters=parameters,
            component_obj=None,
            random_seed=random_seed,
        )

    def fit(self, X, y=None):
        """Fit the datetime featurizer component.

        Records the names of the columns in ``X`` that woodwork selects as
        "datetime", so that ``transform`` knows which columns to featurize.

        Args:
            X (pd.DataFrame): Input features.
            y (pd.Series, optional): Target data. Ignored.

        Returns:
            self
        """
        X = infer_feature_types(X)
        datetime_schema = X.ww.select("datetime", return_schema=True)
        self._date_time_col_names = list(datetime_schema.columns)
        return self

    def transform(self, X, y=None):
        """Transforms data X by creating new features using existing DateTime columns, and then dropping those DateTime columns.

        Each new column is named ``<column>_<feature>``. If no features were
        requested, a copy of X is returned with its datetime columns left in
        place.

        Args:
            X (pd.DataFrame): Input features.
            y (pd.Series, optional): Ignored.

        Returns:
            pd.DataFrame: Transformed X
        """
        X = infer_feature_types(X)
        X = X.ww.copy()

        requested = self.parameters["features_to_extract"]
        if len(requested) == 0:
            return X

        for datetime_col in self._date_time_col_names:
            for feature in requested:
                extractor = self._function_mappings[feature]
                values, mapping = extractor(
                    X[datetime_col],
                    self.encode_as_categories,
                )
                new_name = f"{datetime_col}_{feature}"
                X.ww[new_name] = values
                # Only extractors that return a non-empty mapping are recorded.
                if mapping:
                    self._categories[new_name] = mapping

        X.ww.drop(columns=self._date_time_col_names, inplace=True)
        return X

    def get_feature_names(self):
        """Gets the categories of each datetime feature.

        Returns:
            dict: Dictionary, where each key-value pair is a column name and a
            dictionary mapping the unique feature values to their integer
            encoding.
        """
        return self._categories