Source code for evalml.pipelines.components.transformers.imputers.target_imputer

from functools import wraps

import pandas as pd
from sklearn.impute import SimpleImputer as SkImputer

from evalml.exceptions import ComponentNotYetFittedError
from evalml.pipelines.components import ComponentBaseMeta
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import (
    _retain_custom_types_and_initalize_woodwork,
    infer_feature_types,
)


class TargetImputerMeta(ComponentBaseMeta):
    """ComponentBaseMeta variant whose fit check also works when input features are None.

    The wrapper built by `check_for_fit` defaults both `X` and `y` to None, so a wrapped
    method can be invoked without features and/or without a target.
    """

    @classmethod
    def check_for_fit(cls, method):
        """Wrap `method` so it only runs once the component has been fitted.

        Arguments:
            method (callable): Method with the signature `(self, X, y)` to guard.

        Returns:
            callable: Wrapper that raises ComponentNotYetFittedError when `self._is_fitted`
                is falsy and `self.needs_fitting` is truthy; otherwise it calls `method`
                with `(self, X, y)` and returns its result.
        """

        @wraps(method)
        def _check_for_fit(self, X=None, y=None):
            # `_is_fitted` is read first so `needs_fitting` is only consulted when unfitted.
            if not self._is_fitted and self.needs_fitting:
                klass = type(self).__name__
                raise ComponentNotYetFittedError(
                    f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}."
                )
            return method(self, X, y)

        return _check_for_fit


class TargetImputer(Transformer, metaclass=TargetImputerMeta):
    """Imputes missing target data according to a specified imputation strategy.

    Arguments:
        impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
           numerical data, and "most_frequent", "constant" for object data types. Defaults to "most_frequent".
        fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
           Defaults to None which uses 0 when imputing numerical data and "missing_value" for strings or object data types.
        random_seed (int): Seed for the random number generator. Defaults to 0.
        **kwargs: Extra keyword arguments. They are recorded in the component parameters and
           forwarded unchanged to the underlying sklearn SimpleImputer.
    """

    name = "Target Imputer"
    hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}
    """{
        "impute_strategy": ["mean", "median", "most_frequent"]
    }"""
    modifies_features = False
    modifies_target = True

    def __init__(
        self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs
    ):
        parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value}
        parameters.update(kwargs)
        imputer = SkImputer(strategy=impute_strategy, fill_value=fill_value, **kwargs)
        super().__init__(
            parameters=parameters, component_obj=imputer, random_seed=random_seed
        )

    def fit(self, X, y):
        """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are
            treated as the same.

        Arguments:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
            y (pd.Series, optional): The target training data of length [n_samples]. When None,
                nothing is fitted and the component is returned as-is.

        Returns:
            self
        """
        if y is None:
            return self
        # The wrapped sklearn imputer is fitted on a single-column frame built from the target.
        y = infer_feature_types(y).to_frame()

        # Convert all bool dtypes to category for fitting
        if (y.dtypes == bool).all():
            y = y.astype("category")

        self._component_obj.fit(y)
        return self

    def transform(self, X, y):
        """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.

        Arguments:
            X (pd.DataFrame): Features. Ignored, apart from being passed through `infer_feature_types`
                when not None.
            y (pd.Series): Target data to impute. When None, `(X, None)` is returned.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y

        Raises:
            RuntimeError: If the underlying imputer returns data with no columns.
        """

        if X is not None:
            X = infer_feature_types(X)
        if y is None:
            return X, None
        y_ww = infer_feature_types(y)
        y_df = y_ww.ww.to_frame()

        # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
        if (y_df.dtypes == bool).all():
            # Pass the inferred series, not the raw `y` argument: the helper then always
            # receives a pandas Series (as on the imputed path below) and the caller's
            # original object is not the one being re-initialized.
            return X, _retain_custom_types_and_initalize_woodwork(
                y_ww.ww.logical_type, y_ww
            )

        transformed = self._component_obj.transform(y_df)
        # NOTE(review): sklearn's SimpleImputer documents that a column which was entirely
        # missing at fit time may be dropped on transform; presumably that is the case
        # guarded against here.
        if transformed.shape[1] == 0:
            raise RuntimeError("Transformed data is empty")
        # Rebuild a Series from the single imputed column, keeping the input's index.
        y_t = pd.Series(transformed[:, 0], index=y_ww.index)
        return X, _retain_custom_types_and_initalize_woodwork(y_ww.ww.logical_type, y_t)

    def fit_transform(self, X, y):
        """Fits on and transforms the input target data.

        Arguments:
            X (pd.DataFrame): Features. Ignored.
            y (pd.Series): Target data to impute.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y
        """
        return self.fit(X, y).transform(X, y)