# Source code for evalml.pipelines.components.transformers.preprocessing.lsa
"""Transformer to calculate the Latent Semantic Analysis Values of text input."""
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from evalml.pipelines.components.transformers.preprocessing import (
TextTransformer,
)
from evalml.utils import infer_feature_types
class LSA(TextTransformer):
    """Transformer to calculate the Latent Semantic Analysis Values of text input.

    A TF-IDF vectorizer chained with a truncated SVD is fitted on the pooled
    values of all text columns. At transform time each text column is run
    through that fitted pipeline on its own and replaced by the first two
    columns of the pipeline output.

    Args:
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "LSA Transformer"
    hyperparameter_ranges = {}
    """{}"""

    def __init__(self, random_seed=0, **kwargs):
        # The seed is handed to the SVD step as its random_state.
        self._lsa_pipeline = make_pipeline(
            TfidfVectorizer(), TruncatedSVD(random_state=random_seed)
        )
        # Maps each original text column to the LSA columns derived from it;
        # populated by transform().
        self._provenance = {}
        super().__init__(random_seed=random_seed, **kwargs)

    def fit(self, X, y=None):
        """Fits the input data.

        Args:
            X (pd.DataFrame): The data to transform.
            y (pd.Series, optional): Ignored.

        Returns:
            self
        """
        X = infer_feature_types(X)
        self._text_columns = self._get_text_columns(X)
        if len(self._text_columns) == 0:
            # Nothing to learn from; the pipeline is left unfitted.
            return self

        # Pool every text column into one flat corpus so a single vocabulary
        # and a single SVD basis are shared by all of them.
        corpus = X[self._text_columns].values.flatten()
        # we assume non-str values will have been filtered out prior to calling LSA.fit. this is a safeguard.
        corpus = corpus.astype(str)
        self._lsa_pipeline.fit(corpus)
        return self

    def transform(self, X, y=None):
        """Transforms data X by applying the LSA pipeline.

        Args:
            X (pd.DataFrame): The data to transform.
            y (pd.Series, optional): Ignored.

        Returns:
            pd.DataFrame: Transformed X. The original column is removed and replaced with two columns of the
                format `LSA(original_column_name)[feature_number]`, where `feature_number` is 0 or 1.
        """
        X_ww = infer_feature_types(X)
        if len(self._text_columns) == 0:
            return X_ww

        # NOTE(review): the column assignments below write into X_ww; if
        # infer_feature_types hands back the caller's own frame, the caller
        # sees the added LSA columns too - confirm whether a copy is wanted.
        provenance = {}
        for col in self._text_columns:
            # Apply the same str coercion as fit() so that a value tolerated
            # while fitting cannot make the vectorizer fail here.
            documents = X_ww[col].to_numpy().astype(str)
            transformed = self._lsa_pipeline.transform(documents)

            lsa_columns = []
            for feature_number in range(2):
                lsa_name = "LSA({})[{}]".format(col, feature_number)
                # Re-attach the input index so rows stay aligned with X_ww.
                X_ww.ww[lsa_name] = pd.Series(
                    transformed[:, feature_number], index=X_ww.index
                )
                lsa_columns.append(lsa_name)
            provenance[col] = lsa_columns
        self._provenance = provenance

        # The raw text columns are replaced by their LSA features.
        X_t = X_ww.ww.drop(columns=self._text_columns)
        return X_t

    def _get_feature_provenance(self):
        """Return the mapping recorded by the most recent transform() call.

        Returns:
            dict: Original text column name -> list of the LSA column names
                created from it. Empty if transform() has not produced any.
        """
        return self._provenance