Source code for evalml.pipelines.components.transformers.preprocessing.natural_language_featurizer
"""Transformer that can automatically featurize text columns using featuretools' nlp_primitives."""importstringimportfeaturetoolsasftfromfeaturetools.primitivesimport(DiversityScore,MeanCharactersPerWord,NumCharacters,NumWords,PolarityScore,)fromevalml.pipelines.components.transformers.preprocessingimportLSA,TextTransformerfromevalml.utilsimportinfer_feature_types
class NaturalLanguageFeaturizer(TextTransformer):
    """Transformer that can automatically featurize text columns using featuretools' nlp_primitives.

    Since models cannot handle non-numeric data, any text must be broken down into features that provide useful information about that text. This component splits each text column into several informative features: Diversity Score, Mean Characters per Word, Polarity Score, LSA (Latent Semantic Analysis), Number of Characters, and Number of Words. Calling transform on this component will replace any text columns in the given dataset with these numeric columns.

    Args:
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "Natural Language Featurizer"
    hyperparameter_ranges = {}
    """{}"""

    def __init__(self, random_seed=0, **kwargs):
        # featuretools transform primitives applied to every text column.
        self._trans = [
            NumWords,
            NumCharacters,
            DiversityScore,
            MeanCharactersPerWord,
            PolarityScore,
        ]
        # Feature definitions produced by `fit`; None until text columns have been seen.
        self._features = None
        self._lsa = LSA(random_seed=random_seed)
        self._primitives_provenance = {}
        super().__init__(random_seed=random_seed, **kwargs)

    def _clean_text(self, X):
        """Strip ASCII punctuation from every column of X and make the text lowercase.

        Missing values are replaced with the empty string before cleaning.

        Args:
            X (pd.DataFrame): Text columns to clean. Its columns are reassigned in place.

        Returns:
            pd.DataFrame: The same X object, with cleaned string columns.
        """
        # The translation table is loop-invariant, so build it once instead of once per cell.
        strip_punctuation = str.maketrans("", "", string.punctuation)

        def normalize(text):
            return text.translate(strip_punctuation).lower()

        for col_name in X.columns:
            # We assume non-str values will have been filtered out prior to calling
            # NaturalLanguageFeaturizer; casting to str is a safeguard.
            # The fill is chained on the result rather than done with `inplace=True` on
            # `X[col_name]`: an in-place call on a selected column is not guaranteed to
            # update X (pandas Copy-on-Write), which would turn NaN into the text "nan".
            col = X[col_name].fillna("").astype(str)
            X[col_name] = col.apply(normalize)
        return X

    def _make_entity_set(self, X, text_columns):
        """Build a featuretools EntitySet holding a cleaned copy of the text columns.

        Args:
            X (pd.DataFrame): Data containing the text columns.
            text_columns (list): Labels of the columns of X to include.

        Returns:
            ft.EntitySet: EntitySet with a single dataframe named "X" whose columns
                are all typed as natural language.
        """
        # Work on a copy so that cleaning does not modify the caller's data.
        X_text = X[text_columns].copy()
        X_text = self._clean_text(X_text)

        # featuretools expects str-type column names
        X_text.rename(columns=str, inplace=True)
        all_text_logical_types = {
            col_name: "natural_language" for col_name in X_text.columns
        }

        es = ft.EntitySet()
        es.add_dataframe(
            dataframe_name="X",
            dataframe=X_text,
            index="index",
            make_index=True,
            logical_types=all_text_logical_types,
        )
        return es

    def fit(self, X, y=None):
        """Fits component to data.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series): The target training data of length [n_samples]

        Returns:
            self
        """
        X = infer_feature_types(X)
        self._text_columns = self._get_text_columns(X)
        if len(self._text_columns) == 0:
            # Nothing to featurize; `transform` will pass data through unchanged.
            return self

        self._lsa.fit(X)

        es = self._make_entity_set(X, self._text_columns)
        # features_only=True: only the feature definitions are kept here; the values
        # are computed in `transform`.
        self._features = ft.dfs(
            entityset=es,
            target_dataframe_name="X",
            trans_primitives=self._trans,
            max_depth=1,
            features_only=True,
        )
        return self

    @staticmethod
    def _get_primitives_provenance(features):
        """Map each input column name to the names of the features derived from it.

        Args:
            features (list): Feature definitions, each exposing `base_features` and
                `get_feature_names()`.

        Returns:
            dict: Name of each feature's first base feature -> list of output feature names.
        """
        provenance = {}
        for feature in features:
            input_col = feature.base_features[0].get_name()
            # Copy the names because `get_feature_names` returns a reference to them.
            output_features = list(feature.get_feature_names())
            provenance.setdefault(input_col, []).extend(output_features)
        return provenance

    def transform(self, X, y=None):
        """Transforms data X by creating new features using existing text columns.

        Args:
            X (pd.DataFrame): The data to transform.
            y (pd.Series, optional): Ignored.

        Returns:
            pd.DataFrame: Transformed X
        """
        X_ww = infer_feature_types(X)
        if self._features is None or len(self._features) == 0:
            return X_ww

        es = self._make_entity_set(X_ww, self._text_columns)
        # Build the mask from the converted frame rather than the raw input, which
        # may not be a DataFrame and so may not support label indexing or `.isna()`.
        nan_mask = X_ww[self._text_columns].isna()
        any_nans = nan_mask.any().any()

        X_nlp_primitives = ft.calculate_feature_matrix(
            features=self._features,
            entityset=es,
        )
        if X_nlp_primitives.isnull().any().any():
            X_nlp_primitives.fillna(0, inplace=True)

        X_ww_altered = infer_feature_types(
            X_ww.ww[self._text_columns].fillna(""),
            {s: "NaturalLanguage" for s in self._text_columns},
        )
        X_lsa = self._lsa.transform(X_ww_altered)
        X_nlp_primitives.set_index(X_ww.index, inplace=True)

        if any_nans:
            # Rows whose source text was missing get missing values in every feature
            # derived from that text column.
            # Primitive provenance is keyed by the str-renamed column names used in the
            # entity set, so look those up in a str-labelled copy of the mask.
            primitive_nan_mask = nan_mask.rename(columns=str)
            primitive_features = self._get_primitives_provenance(self._features)
            for column, derived_features in primitive_features.items():
                X_nlp_primitives.loc[
                    primitive_nan_mask[column],
                    derived_features,
                ] = None

            lsa_features = self._lsa._get_feature_provenance()
            for column, derived_features in lsa_features.items():
                X_lsa.loc[nan_mask[column], derived_features] = None

        X_lsa.ww.init(logical_types={col: "Double" for col in X_lsa.columns})
        X_nlp_primitives.ww.init(
            logical_types={col: "Double" for col in X_nlp_primitives.columns},
        )

        # Replace the text columns with the generated numeric features.
        X_ww = X_ww.ww.drop(self._text_columns)
        for col in X_nlp_primitives:
            X_ww.ww[col] = X_nlp_primitives[col]
        for col in X_lsa:
            X_ww.ww[col] = X_lsa[col]
        return X_ww