Source code for evalml.pipelines.components.transformers.preprocessing.lsa
"""Transformer to calculate the Latent Semantic Analysis Values of text input."""
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from evalml.pipelines.components.transformers.preprocessing import (
    TextTransformer,
)
from evalml.utils import infer_feature_types


class LSA(TextTransformer):
    """Transformer to calculate the Latent Semantic Analysis Values of text input.

    Args:
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "LSA Transformer"
    hyperparameter_ranges = {}
    """{}"""

    def __init__(self, random_seed=0, **kwargs):
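        # Build a scikit-learn pipeline of TF-IDF vectorization followed by
        # truncated SVD; TruncatedSVD defaults to n_components=2, so each
        # document is projected onto two LSA components.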
        self._lsa_pipeline = make_pipeline(
            TfidfVectorizer(), TruncatedSVD(random_state=random_seed)
        )
        self._provenance = {}
        super().__init__(random_seed=random_seed, **kwargs)

    def fit(self, X, y=None):
        """Fits the input data.

        Args:
            X (pd.DataFrame): The data to transform.
            y (pd.Series, optional): Ignored.

        Returns:
            self
        """
        X = infer_feature_types(X)
        self._text_columns = self._get_text_columns(X)
        if len(self._text_columns) == 0:
            return self
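        # Gather the values of every text column into one flat corpus so that a
        # single TF-IDF vocabulary and SVD basis is fit across all text columns.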
        corpus = X[self._text_columns].values.flatten()
        # We assume non-str values will have been filtered out prior to calling LSA.fit; this is a safeguard.
        corpus = corpus.astype(str)
        self._lsa_pipeline.fit(corpus)
        return self

    def _get_feature_provenance(self):
        return self._provenance
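

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module), run only when this
# file is executed directly. It assumes the text column is typed as Woodwork
# NaturalLanguage so that _get_text_columns selects it; the column name
# "review_text" and the sample strings are illustrative only. The full module
# also defines a transform method, which is not shown in this excerpt.
if __name__ == "__main__":
    import woodwork  # noqa: F401 -- registers the DataFrame.ww accessor

    X = pd.DataFrame(
        {
            "review_text": [
                "The product arrived quickly and works exactly as described.",
                "Packaging was damaged and the setup instructions were hard to follow.",
                "Average quality overall, but customer support was very responsive.",
            ]
        }
    )
    X.ww.init(logical_types={"review_text": "NaturalLanguage"})

    lsa = LSA(random_seed=0)
    lsa.fit(X)  # fits the TF-IDF + TruncatedSVD pipeline on the flattened corpus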