Source code for evalml.pipelines.components.transformers.preprocessing.transform_primitive_components

"""Components that extract features from the input data."""
from abc import abstractmethod

import featuretools as ft
import woodwork as ww

from evalml.pipelines.components.transformers.transformer import Transformer
from evalml.utils import infer_feature_types


class _ExtractFeaturesWithTransformPrimitives(Transformer):
    hyperparameter_ranges = {}
    """{}"""

    def __init__(self, random_seed=0, **kwargs):
        self._columns = None
        self._features = None
        super().__init__(random_seed=random_seed, **kwargs)

    @property
    @classmethod
    @abstractmethod
    def _transform_primitives(cls):
        """Return the transform primitives extracted from this component."""

    @abstractmethod
    def _get_columns_to_transform(self, X):
        """Return the columns that the primitives will transform."""

    @abstractmethod
    def _get_feature_types_for_featuretools(self, X):
        """Get a mapping from column name to the feature tools type.

        This is needed for dfs. Hopefully, once the ww/ft integration is
        complete this will be redundant.
        """

    def _make_entity_set(self, X):
        X_to_transform = X[self._columns]
        X_to_transform.rename(columns=str, inplace=True)
        ww_logical_types = self._get_feature_types_for_featuretools(X)
        es = ft.EntitySet()
        es.add_dataframe(
            dataframe_name="X",
            dataframe=X_to_transform,
            index="index",
            make_index=True,
            logical_types=ww_logical_types,
        )
        return es

    def fit(self, X, y=None):
        X = infer_feature_types(X)
        self._columns = self._get_columns_to_transform(X)
        if len(self._columns) == 0:
            return self

        es = self._make_entity_set(X)
        self._features = ft.dfs(
            entityset=es,
            target_dataframe_name="X",
            trans_primitives=self._transform_primitives,
            max_depth=1,
            features_only=True,
        )
        return self

    def transform(self, X, y=None):
        X_ww = infer_feature_types(X)
        if self._features is None or len(self._features) == 0:
            return X_ww

        es = self._make_entity_set(X_ww)
        features = ft.calculate_feature_matrix(features=self._features, entityset=es)

        # Convert to object dtype so that pd.NA is converted to np.nan
        # until sklearn imputer can handle pd.NA in release 1.1
        # FT returns these as string types, currently there isn't much difference
        # in terms of performance between object and string
        # see https://pandas.pydata.org/docs/user_guide/text.html#text-data-types
        # "Currently, the performance of object dtype arrays of strings
        # "and arrays.StringArray are about the same."
        features = features.astype(object, copy=False)
        features.index = X_ww.index
        features.ww.init(logical_types={col_: "categorical" for col_ in features})

        X_ww = X_ww.ww.drop(self._columns)
        X_ww = ww.concat_columns([X_ww, features])

        return X_ww

    @staticmethod
    def _get_primitives_provenance(features):
        provenance = {}
        for feature in features:
            input_col = feature.base_features[0].get_name()
            # Return a copy because `get_feature_names` returns a reference to the names
            output_features = [name for name in feature.get_feature_names()]
            if input_col not in provenance:
                provenance[input_col] = output_features
            else:
                provenance[input_col] += output_features
        return provenance

    def _get_feature_provenance(self):
        provenance = {}
        if self._columns:
            provenance = self._get_primitives_provenance(self._features)
        return provenance


[docs]class EmailFeaturizer(_ExtractFeaturesWithTransformPrimitives): """Transformer that can automatically extract features from emails. Args: random_seed (int): Seed for the random number generator. Defaults to 0. """ name = "Email Featurizer" _transform_primitives = [ ft.primitives.IsFreeEmailDomain, ft.primitives.EmailAddressToDomain, ] def _get_columns_to_transform(self, X): return list(X.ww.select("EmailAddress", return_schema=True).columns) def _get_feature_types_for_featuretools(self, X): return { col_name: ww.logical_types.EmailAddress.type_string for col_name in self._columns }
[docs]class URLFeaturizer(_ExtractFeaturesWithTransformPrimitives): """Transformer that can automatically extract features from URL. Args: random_seed (int): Seed for the random number generator. Defaults to 0. """ name = "URL Featurizer" _transform_primitives = [ ft.primitives.URLToTLD, ft.primitives.URLToDomain, ft.primitives.URLToProtocol, ] def _get_columns_to_transform(self, X): return list(X.ww.select("URL", return_schema=True).columns) def _get_feature_types_for_featuretools(self, X): return { col_name: ww.logical_types.URL.type_string for col_name in self._columns }