Source code for evalml.pipelines.components.transformers.imputers.target_imputer

"""Component that imputes missing target data according to a specified imputation strategy."""
from functools import wraps

import pandas as pd
import woodwork as ww
from sklearn.impute import SimpleImputer as SkImputer
from woodwork.logical_types import Categorical, Integer, IntegerNullable

from evalml.exceptions import ComponentNotYetFittedError
from evalml.pipelines.components import ComponentBaseMeta
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types


class TargetImputerMeta(ComponentBaseMeta):
    """A version of the ComponentBaseMeta class which handles when input features is None.

    The fit check produced here wraps methods with an ``(X=None, y=None)``
    signature, so the wrapped method may be invoked without features and/or
    without a target.
    """

    @classmethod
    def check_for_fit(cls, method):
        """Wrap a method so that it validates `self._is_fitted` before running.

        Args:
            method (callable): Method to wrap.

        Raises:
            ComponentNotYetFittedError: If the component needs fitting and is not fitted.

        Returns:
            The wrapped input method.
        """

        @wraps(method)
        def _check_for_fit(self, X=None, y=None):
            # Happy path first: already fitted, or the component never needs fitting.
            if self._is_fitted or not self.needs_fitting:
                return method(self, X, y)

            klass = type(self).__name__
            raise ComponentNotYetFittedError(
                f"This {klass} is not fitted yet. You must fit {klass} before calling {method.__name__}.",
            )

        return _check_for_fit
class TargetImputer(Transformer, metaclass=TargetImputerMeta):
    """Imputes missing target data according to a specified imputation strategy.

    Args:
        impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
           numerical data, and "most_frequent", "constant" for object data types. Defaults to "most_frequent".
        fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
           Defaults to None which uses 0 when imputing numerical data and "missing_value" for strings or object data types.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "Target Imputer"
    hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}
    """{
        "impute_strategy": ["mean", "median", "most_frequent"]
    }"""
    # This component leaves X untouched and only rewrites y.
    modifies_features = False
    modifies_target = True

    def __init__(
        self,
        impute_strategy="most_frequent",
        fill_value=None,
        random_seed=0,
        **kwargs,
    ):
        # The wrapped scikit-learn imputer receives the strategy, the fill value
        # and any extra keyword arguments unchanged.
        imputer = SkImputer(strategy=impute_strategy, fill_value=fill_value, **kwargs)
        # Extra keyword arguments are recorded alongside the named parameters.
        parameters = {
            "impute_strategy": impute_strategy,
            "fill_value": fill_value,
            **kwargs,
        }
        super().__init__(
            parameters=parameters,
            component_obj=imputer,
            random_seed=random_seed,
        )

    def fit(self, X, y):
        """Fits imputer to target data. 'None' values are converted to np.nan before imputation and are treated as the same.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]. Ignored.
            y (pd.Series, optional): The target training data of length [n_samples]. When None,
                nothing is fitted and the component is returned as-is.

        Returns:
            self

        Raises:
            TypeError: If target is filled with all null values.
        """
        if y is None:
            return self

        target = infer_feature_types(y)
        if target.isnull().all():
            raise TypeError("Provided target full of nulls.")

        target_frame = target.to_frame()
        # Convert all bool dtypes to category for fitting
        only_bool_columns = (target_frame.dtypes == bool).all()
        if only_bool_columns:
            target_frame = target_frame.astype("category")

        self._component_obj.fit(target_frame)
        return self

    def transform(self, X, y):
        """Transforms input target data by imputing missing values. 'None' and np.nan values are treated as the same.

        Args:
            X (pd.DataFrame): Features. Ignored (only passed through type inference when not None).
            y (pd.Series): Target data to impute.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y
        """
        X = None if X is None else infer_feature_types(X)
        if y is None:
            return X, None

        original = infer_feature_types(y)
        original_frame = original.ww.to_frame()

        # Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
        if (original_frame.dtypes == bool).all():
            return X, original

        imputed_values = self._component_obj.transform(original_frame)
        # The imputer returns a 2-D array; the target is its single (first) column.
        imputed = pd.Series(imputed_values[:, 0], index=original.index)

        # TODO: Fix this after WW adds inference of object type booleans to BooleanNullable
        # A categorical target that might have been boolean is converted back to boolean.
        # NOTE(review): True == 1 and False == 0 in Python, so a categorical target whose
        # values include integer 0 and 1 also satisfies this subset check — confirm intended.
        has_both_booleans = {True, False} <= set(imputed.unique())
        if has_both_booleans and isinstance(original.ww.logical_type, Categorical):
            imputed = imputed.astype(bool)

        # After imputation an IntegerNullable target is initialized as plain Integer;
        # any other logical type is carried over unchanged.
        if isinstance(original.ww.logical_type, IntegerNullable):
            target_logical_type = Integer
        else:
            target_logical_type = original.ww.logical_type

        imputed = ww.init_series(
            imputed,
            logical_type=target_logical_type,
            semantic_tags=original.ww.semantic_tags,
        )
        return X, imputed

    def fit_transform(self, X, y):
        """Fits on and transforms the input target data.

        Args:
            X (pd.DataFrame): Features. Ignored.
            y (pd.Series): Target data to impute.

        Returns:
            (pd.DataFrame, pd.Series): The original X, transformed y
        """
        fitted = self.fit(X, y)
        return fitted.transform(X, y)