Source code for evalml.pipelines.components.transformers.imputers.simple_imputer

"""Component that imputes missing data according to a specified imputation strategy."""
import pandas as pd
import woodwork
from sklearn.impute import SimpleImputer as SkImputer

from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types
from evalml.utils.nullable_type_utils import _get_new_logical_types_for_imputed_data

[docs]class SimpleImputer(Transformer): """Imputes missing data according to a specified imputation strategy. Natural language columns are ignored. Args: impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for numerical data, and "most_frequent", "constant" for object data types. fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data. Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types. random_seed (int): Seed for the random number generator. Defaults to 0. """ name = "Simple Imputer" hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]} """{ "impute_strategy": ["mean", "median", "most_frequent"] }""" def __init__( self, impute_strategy="most_frequent", fill_value=None, random_seed=0, **kwargs ): parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value} parameters.update(kwargs) self.impute_strategy = impute_strategy imputer = SkImputer( strategy=impute_strategy, fill_value=fill_value, missing_values=pd.NA, **kwargs, ) self._all_null_cols = None super().__init__( parameters=parameters, component_obj=imputer, random_seed=random_seed, )
[docs] def fit(self, X, y=None): """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same. Args: X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] y (pd.Series, optional): the target training data of length [n_samples] Returns: self Raises: ValueError: if the SimpleImputer receives a dataframe with both Boolean and Categorical data. """ X = infer_feature_types(X) if set([lt.type_string for lt in X.ww.logical_types.values()]) == { "boolean", "categorical", }: raise ValueError( "SimpleImputer cannot handle dataframes with both boolean and categorical features. Use Imputer instead.", ) nan_ratio = X.isna().sum() / X.shape[0] # Keep track of the different types of data in X self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist() self._natural_language_cols = list( "NaturalLanguage", return_schema=True, ).columns.keys(), ) # Only impute data that is not natural language columns or fully null self._cols_to_impute = [ col for col in X.columns if col not in self._natural_language_cols and col not in self._all_null_cols ] # If there are no columns to impute, return early if not self._cols_to_impute: return self X = X[self._cols_to_impute] if (X.dtypes == bool).all(): # Ensure that _component_obj still gets fit so that if any of the dtypes are different # at transform, we've fit the component. This is needed because sklearn doesn't allow # data with only bool dtype to be passed in. X = X.astype("boolean"), y) return self
[docs] def transform(self, X, y=None): """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same. Args: X (pd.DataFrame): Data to transform. y (pd.Series, optional): Ignored. Returns: pd.DataFrame: Transformed X """ # Record original data X = infer_feature_types(X) original_schema = X.ww.schema original_index = X.index # separate out just the columns we are imputing X_t = X[self._cols_to_impute] if not self._cols_to_impute or (X_t.dtypes == bool).all(): # If there are no columns to impute or all columns to impute are bool dtype, # which will never have null values, return the original data without any fully null columns not_all_null_cols = [ col for col in X.columns if col not in self._all_null_cols ] return X.ww[not_all_null_cols] # Transform the data X_t = self._component_obj.transform(X_t) X_t = pd.DataFrame(X_t, columns=self._cols_to_impute) # Reinit woodwork, maintaining original types where possible imputed_schema = original_schema.get_subset_schema(self._cols_to_impute) new_logical_types = _get_new_logical_types_for_imputed_data( impute_strategy=self.impute_strategy, original_schema=imputed_schema, ) X_t.ww.init(schema=imputed_schema, logical_types=new_logical_types) # Add back in the unchanged original natural language columns that we want to keep if len(self._natural_language_cols) > 0: X_t = woodwork.concat_columns([X_t, X.ww[self._natural_language_cols]]) # reorder columns to match original X_t = X_t.ww[[col for col in original_schema.columns if col in X_t.columns]] if self._cols_to_impute: X_t.index = original_index return X_t
[docs] def fit_transform(self, X, y=None): """Fits on X and transforms X. Args: X (pd.DataFrame): Data to fit and transform y (pd.Series, optional): Target data. Returns: pd.DataFrame: Transformed X """ return, y).transform(X, y)