Source code for evalml.pipelines.components.transformers.preprocessing.transform_primitive_components
"""Components that extract features from the input data."""fromabcimportabstractmethodimportfeaturetoolsasftimportwoodworkaswwfromevalml.pipelines.components.transformers.transformerimportTransformerfromevalml.utilsimportinfer_feature_typesclass_ExtractFeaturesWithTransformPrimitives(Transformer):hyperparameter_ranges={}"""{}"""def__init__(self,random_seed=0,**kwargs):self._columns=Noneself._features=Nonesuper().__init__(random_seed=random_seed,**kwargs)@property@classmethod@abstractmethoddef_transform_primitives(cls):"""Return the transform primitives extracted from this component."""@abstractmethoddef_get_columns_to_transform(self,X):"""Return the columns that the primitives will transform."""@abstractmethoddef_get_feature_types_for_featuretools(self,X):"""Get a mapping from column name to the feature tools type. This is needed for dfs. Hopefully, once the ww/ft integration is complete this will be redundant. """def_make_entity_set(self,X):X_to_transform=X[self._columns]X_to_transform.rename(columns=str,inplace=True)ww_logical_types=self._get_feature_types_for_featuretools(X)es=ft.EntitySet()es.add_dataframe(dataframe_name="X",dataframe=X_to_transform,index="index",make_index=True,logical_types=ww_logical_types,)returnesdeffit(self,X,y=None):X=infer_feature_types(X)self._columns=self._get_columns_to_transform(X)iflen(self._columns)==0:returnselfes=self._make_entity_set(X)self._features=ft.dfs(entityset=es,target_dataframe_name="X",trans_primitives=self._transform_primitives,max_depth=1,features_only=True,)returnselfdeftransform(self,X,y=None):X_ww=infer_feature_types(X)ifself._featuresisNoneorlen(self._features)==0:returnX_wwes=self._make_entity_set(X_ww)features=ft.calculate_feature_matrix(features=self._features,entityset=es)ltypes=features.ww.logical_types# CatBoost has an issue with categoricals with string categories:# https://github.com/catboost/catboost/issues/1965# Which will pop up if these categorical features are left with string categories,# so 
convert them to object until the bug is fixed.features=features.astype(object,copy=False)features.ww.init(logical_types=ltypes)X_ww=X_ww.ww.drop(self._columns)X_ww=ww.concat_columns([X_ww,features])returnX_ww@staticmethoddef_get_primitives_provenance(features):provenance={}forfeatureinfeatures:input_col=feature.base_features[0].get_name()# Return a copy because `get_feature_names` returns a reference to the namesoutput_features=[namefornameinfeature.get_feature_names()]ifinput_colnotinprovenance:provenance[input_col]=output_featureselse:provenance[input_col]+=output_featuresreturnprovenancedef_get_feature_provenance(self):provenance={}ifself._columns:provenance=self._get_primitives_provenance(self._features)returnprovenance
class EmailFeaturizer(_ExtractFeaturesWithTransformPrimitives):
    """Transformer that can automatically extract features from emails.

    Args:
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "Email Featurizer"
    _transform_primitives = [
        ft.primitives.IsFreeEmailDomain,
        ft.primitives.EmailAddressToDomain,
    ]

    def _get_columns_to_transform(self, X):
        """Return the names of the columns selected by "EmailAddress" in X's woodwork schema."""
        email_schema = X.ww.select("EmailAddress", return_schema=True)
        return list(email_schema.columns)

    def _get_feature_types_for_featuretools(self, X):
        """Map every column in ``self._columns`` to the EmailAddress type string."""
        return dict.fromkeys(
            self._columns,
            ww.logical_types.EmailAddress.type_string,
        )
class URLFeaturizer(_ExtractFeaturesWithTransformPrimitives):
    """Transformer that can automatically extract features from URL.

    Args:
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "URL Featurizer"
    _transform_primitives = [
        ft.primitives.URLToTLD,
        ft.primitives.URLToDomain,
        ft.primitives.URLToProtocol,
    ]

    def _get_columns_to_transform(self, X):
        """Return the names of the columns selected by "URL" in X's woodwork schema."""
        url_schema = X.ww.select("URL", return_schema=True)
        return list(url_schema.columns)

    def _get_feature_types_for_featuretools(self, X):
        """Map every column in ``self._columns`` to the URL type string."""
        return dict.fromkeys(
            self._columns,
            ww.logical_types.URL.type_string,
        )