Source code for evalml.pipelines.components.transformers.preprocessing.lsa
"""Transformer to calculate the Latent Semantic Analysis Values of text input."""importpandasaspdfromsklearn.decompositionimportTruncatedSVDfromsklearn.feature_extraction.textimportTfidfVectorizerfromsklearn.pipelineimportmake_pipelinefromevalml.pipelines.components.transformers.preprocessingimportTextTransformerfromevalml.utilsimportinfer_feature_types
class LSA(TextTransformer):
    """Transformer to calculate the Latent Semantic Analysis Values of text input.

    Text columns are passed through a TF-IDF vectorizer followed by a truncated
    SVD; `transform` swaps each text column for two numeric columns named
    `LSA(<column>)[0]` and `LSA(<column>)[1]`.

    Args:
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "LSA Transformer"
    hyperparameter_ranges = {}
    """{}"""

    def __init__(self, random_seed=0, **kwargs):
        # The seed is forwarded to the SVD step and to the parent constructor.
        vectorizer = TfidfVectorizer()
        svd = TruncatedSVD(random_state=random_seed)
        self._lsa_pipeline = make_pipeline(vectorizer, svd)
        # Filled in by `transform`: text column name -> names of the columns derived from it.
        self._provenance = {}
        super().__init__(random_seed=random_seed, **kwargs)

    def fit(self, X, y=None):
        """Fits the LSA pipeline on the text columns of X.

        All text columns are pooled into a single corpus, so one pipeline is
        shared by every column. When X has no text columns nothing is fitted.

        Args:
            X (pd.DataFrame): The data to fit on.
            y (pd.Series, optional): Ignored.

        Returns:
            self
        """
        X = infer_feature_types(X)
        self._text_columns = self._get_text_columns(X)

        if len(self._text_columns) != 0:
            # We assume non-str values will have been filtered out prior to calling
            # LSA.fit; casting to str here is only a safeguard.
            documents = X[self._text_columns].values.flatten().astype(str)
            self._lsa_pipeline.fit(documents)
        return self

    def transform(self, X, y=None):
        """Transforms data X by applying the LSA pipeline.

        Also records, for each text column, the names of the columns created
        from it in `self._provenance`.

        Args:
            X (pd.DataFrame): The data to transform.
            y (pd.Series, optional): Ignored.

        Returns:
            pd.DataFrame: Transformed X. The original column is removed and replaced with two columns of the format
                          `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1.
                          If no text columns were found during fit, X is returned with no columns added or removed.
        """
        X_ww = infer_feature_types(X)
        text_columns = self._text_columns
        if len(text_columns) == 0:
            return X_ww

        created = {}
        for column in text_columns:
            components = self._lsa_pipeline.transform(X_ww[column])
            new_names = [
                "LSA({})[0]".format(column),
                "LSA({})[1]".format(column),
            ]
            # Component i of the pipeline output becomes the i-th derived column.
            for position, new_name in enumerate(new_names):
                X_ww.ww[new_name] = pd.Series(
                    components[:, position],
                    index=X_ww.index,
                )
            created[column] = new_names
        self._provenance = created

        return X_ww.ww.drop(columns=text_columns)