Source code for evalml.pipelines.components.transformers.imputers.target_imputer

"""Component that imputes missing target data according to a specified imputation strategy."""
from functools import wraps

import pandas as pd
import woodwork as ww
from sklearn.impute import SimpleImputer as SkImputer
from woodwork.logical_types import Categorical, Integer, IntegerNullable

from evalml.exceptions import ComponentNotYetFittedError
from evalml.pipelines.components import ComponentBaseMeta
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types


[docs]class TargetImputerMeta(ComponentBaseMeta):
    """A version of the ComponentBaseMeta class which handles when input features is None."""

[docs]    @classmethod
    def check_for_fit(cls, method):
        """`check_for_fit` wraps a method that validates if `self._is_fitted` is `True`.

        Args:
            method (callable): Method to wrap.

        Raises:
            ComponentNotYetFittedError: If component is not fitted.

        Returns:
            The wrapped input method.
        """

        @wraps(method)
        def _check_for_fit(self, X=None, y=None):
            klass = type(self).__name__
            if not self._is_fitted and self.needs_fitting:
                raise ComponentNotYetFittedError(
                    f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.",
                )
            else:
                return method(self, X, y)

        return _check_for_fit


[docs]class TargetImputer(Transformer, metaclass=TargetImputerMeta):
    """Imputes missing target data according to a specified imputation strategy.

    Args:
        impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
           numerical data, and "most_frequent", "constant" for object data types. Defaults to "most_frequent".
        fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
           Defaults to None which uses 0 when imputing numerical data and "missing_value" for strings or object data types.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "Target Imputer"
    hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}
    """{
        "impute_strategy": ["mean", "median", "most_frequent"]
    }"""
    modifies_features = False
    modifies_target = True

    def __init__(
        self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs
    ):
        parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value}
        parameters.update(kwargs)
        imputer = SkImputer(strategy=impute_strategy, fill_value=fill_value, **kwargs)
        super().__init__(
            parameters=parameters,
            component_obj=imputer,
            random_seed=random_seed,
        )

[docs]    def fit(self, X, y):
        """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are treated as the same.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            TypeError: If target is filled with all null values.
        """
        if y is None:
            return self
        y = infer_feature_types(y)
        if all(y.isnull()):
            raise TypeError("Provided target full of nulls.")
        y = y.to_frame()

        # Convert all bool dtypes to category for fitting
        if (y.dtypes == bool).all():
            y = y.astype("category")

        self._component_obj.fit(y)
        return self

[docs]    def transform(self, X, y):
        """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.

        Args:
            X (pd.DataFrame): Features. Ignored.
            y (pd.Series): Target data to impute.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y
        """
        if X is not None:
            X = infer_feature_types(X)
        if y is None:
            return X, None
        y_ww = infer_feature_types(y)
        y_df = y_ww.ww.to_frame()

        # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
        if (y_df.dtypes == bool).all():
            return X, y_ww

        transformed = self._component_obj.transform(y_df)
        y_t = pd.Series(transformed[:, 0], index=y_ww.index)

        # TODO: Fix this after WW adds inference of object type booleans to BooleanNullable
        # Iterate through categorical columns that might have been boolean and convert them back to boolean
        if {True, False}.issubset(set(y_t.unique())) and isinstance(
            y_ww.ww.logical_type,
            Categorical,
        ):
            y_t = y_t.astype(bool)

        new_logical_type = (
            Integer
            if isinstance(y_ww.ww.logical_type, IntegerNullable)
            else y_ww.ww.logical_type
        )

        y_t = ww.init_series(
            y_t,
            logical_type=new_logical_type,
            semantic_tags=y_ww.ww.semantic_tags,
        )
        return X, y_t

[docs]    def fit_transform(self, X, y):
        """Fits on and transforms the input target data.

        Args:
            X (pd.DataFrame): Features. Ignored.
            y (pd.Series): Target data to impute.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y
        """
        return self.fit(X, y).transform(X, y)