import string
import warnings

import featuretools as ft
import pandas as pd
from nlp_primitives import (
    LSA,
    DiversityScore,
    MeanCharactersPerWord,
    PartOfSpeechCount,
    PolarityScore
)

from evalml.pipelines.components.transformers import Transformer


class TextFeaturizer(Transformer):
    """Transformer that can automatically featurize text columns."""
    name = "Text Featurization Component"
    hyperparameter_ranges = {}

    def __init__(self, text_columns=None, random_state=0, **kwargs):
        """Extracts features from text columns using featuretools' nlp_primitives

        Arguments:
            text_columns (list): list of `pd.DataFrame` column names that contain text.
                Names are converted to `str`; the list passed in is not modified.
            random_state (int, np.random.RandomState): Seed for the random number generator.
        """
        # Build a new list of stringified names instead of rewriting the
        # caller's list in place.
        text_columns = [str(col_name) for col_name in (text_columns or [])]
        parameters = {'text_columns': text_columns}
        parameters.update(kwargs)

        if len(text_columns) == 0:
            warnings.warn("No text columns were given to TextFeaturizer, component will have no effect", RuntimeWarning)

        # Separate copy: _verify_col_names may drop entries from this list and
        # that must not leak into the stored parameters.
        self.text_col_names = list(text_columns)
        self._features = None
        super().__init__(parameters=parameters,
                         component_obj=None,
                         random_state=random_state)

    def _clean_text(self, X):
        """Return a copy of X whose text columns are lower-cased with punctuation removed.

        Arguments:
            X (pd.DataFrame): Data containing every column in `self.text_col_names`

        Returns:
            pd.DataFrame: Cleaned copy; the input frame is left untouched
        """
        strip_punctuation = str.maketrans('', '', string.punctuation)

        def normalize(text):
            # Leave non-string cells (NaN, numbers) alone so that a non-text
            # column is reported by _verify_col_types rather than crashing here.
            if not isinstance(text, str):
                return text
            return text.translate(strip_punctuation).lower()

        X = X.copy()
        for text_col in self.text_col_names:
            X[text_col] = X[text_col].apply(normalize)
        return X

    def _verify_col_names(self, col_names):
        """Check the configured text columns against the columns actually present.

        Columns that are not present are dropped from `self.text_col_names`
        with a warning.

        Arguments:
            col_names (iterable): Column names of the DataFrame being processed

        Raises:
            RuntimeError: If none of the configured text columns are present
        """
        missing_cols = [col for col in self.text_col_names if col not in col_names]
        if len(missing_cols) == 0:
            return

        if len(missing_cols) == len(self.text_col_names):
            raise RuntimeError("None of the provided text column names match the columns in the given DataFrame")
        for col in missing_cols:
            self.text_col_names.remove(col)
        warnings.warn("Columns {} were not found in the given DataFrame, ignoring".format(missing_cols), RuntimeWarning)

    def _verify_col_types(self, entity_set):
        """Ensure every text column was typed as Text in the entity set.

        Arguments:
            entity_set (ft.EntitySet): Entity set whose first entity holds the text columns

        Raises:
            ValueError: If any configured column is not a Text variable
        """
        var_types = entity_set.entities[0].variable_types
        for col in self.text_col_names:
            if var_types[col] is not ft.variable_types.variable.Text:
                raise ValueError("Column {} is not a text column, cannot apply TextFeaturizer component".format(col))

    def _make_entity_set(self, X):
        """Build a single-entity EntitySet from the cleaned text columns of X.

        Arguments:
            X (pd.DataFrame): Data with `str` column names containing the text columns

        Returns:
            ft.EntitySet: Entity set with entity 'X', indexed by row position
        """
        # Clean first, so the primitives actually see the normalized text.
        # _clean_text returns a copy, so adding the helper column below
        # touches neither the caller's data nor a pandas slice.
        X_text = self._clean_text(X[self.text_col_names])
        X_text['index'] = range(len(X_text))

        es = ft.EntitySet()
        es = es.entity_from_dataframe(entity_id='X', dataframe=X_text, index='index')
        self._verify_col_types(es)
        return es

    def fit(self, X, y=None):
        """Builds the feature definitions for the configured text columns

        Arguments:
            X (pd.DataFrame): Data to fit on
            y (pd.Series, optional): Input Labels

        Returns:
            self
        """
        if len(self.text_col_names) == 0:
            self._features = []
            return self
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        # Same column-name normalization as transform; text_col_names are
        # always strings, so the frame's labels must be too.
        X = X.rename(columns=str)
        self._verify_col_names(X.columns)

        es = self._make_entity_set(X)

        trans = [DiversityScore,
                 LSA,
                 MeanCharactersPerWord,
                 PartOfSpeechCount,
                 PolarityScore]

        self._features = ft.dfs(entityset=es,
                                target_entity='X',
                                trans_primitives=trans,
                                features_only=True)
        return self

    def transform(self, X, y=None):
        """Transforms data X by creating new features using existing text columns

        Arguments:
            X (pd.DataFrame): Data to transform
            y (pd.Series, optional): Input Labels

        Returns:
            pd.DataFrame: Transformed X

        Raises:
            RuntimeError: If called before fit
        """
        if self._features is None:
            raise RuntimeError(f"You must fit {self.name} before calling transform!")
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        if len(self._features) == 0:
            return X

        X = X.rename(columns=str)
        self._verify_col_names(X.columns)

        es = self._make_entity_set(X)
        feature_matrix = ft.calculate_feature_matrix(features=self._features,
                                                     entityset=es,
                                                     verbose=True)

        # The feature matrix is keyed by the positional ids 0..n-1 assigned in
        # _make_entity_set. Put the rows in positional order, then relabel them
        # with X's index so the concat lines up for any index, not only the
        # default RangeIndex.
        feature_matrix = feature_matrix.reindex(range(len(X)))
        feature_matrix.index = X.index

        X_t = X.drop(self.text_col_names, axis=1)
        return pd.concat([X_t, feature_matrix], axis=1)