Source code for evalml.pipelines.components.transformers.imputers.simple_imputer
"""Component that imputes missing data according to a specified imputation strategy."""importpandasaspdimportwoodworkfromsklearn.imputeimportSimpleImputerasSkImputerfromwoodwork.logical_typesimportDoublefromevalml.pipelines.components.transformersimportTransformerfromevalml.pipelines.components.utilsimport(drop_natural_language_columns,set_boolean_columns_to_integer,)fromevalml.utilsimportinfer_feature_typesfromevalml.utils.gen_utilsimportis_categorical_actually_boolean
class SimpleImputer(Transformer):
    """Imputes missing data according to a specified imputation strategy. Natural language columns are ignored.

    Args:
        impute_strategy (string): Impute strategy to use. Valid values include "mean", "median", "most_frequent", "constant" for
            numerical data, and "most_frequent", "constant" for object data types.
        fill_value (string): When impute_strategy == "constant", fill_value is used to replace missing data.
            Defaults to 0 when imputing numerical data and "missing_value" for strings or object data types.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "Simple Imputer"
    hyperparameter_ranges = {"impute_strategy": ["mean", "median", "most_frequent"]}
    """{
        "impute_strategy": ["mean", "median", "most_frequent"]
    }"""

    def __init__(
        self,
        impute_strategy="most_frequent",
        fill_value=None,
        random_seed=0,
        **kwargs,
    ):
        parameters = {"impute_strategy": impute_strategy, "fill_value": fill_value}
        parameters.update(kwargs)
        self.impute_strategy = impute_strategy
        imputer = SkImputer(
            strategy=impute_strategy,
            fill_value=fill_value,
            missing_values=pd.NA,
            **kwargs,
        )
        self._all_null_cols = None
        super().__init__(
            parameters=parameters,
            component_obj=imputer,
            random_seed=random_seed,
        )
    def fit(self, X, y=None):
        """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same.

        Args:
            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
            y (pd.Series, optional): the target training data of length [n_samples]

        Returns:
            self

        Raises:
            ValueError: if the SimpleImputer receives a dataframe with both Boolean and Categorical data.
        """
        X = infer_feature_types(X)
        if set([lt.type_string for lt in X.ww.logical_types.values()]) == {
            "boolean",
            "categorical",
        } and not all(
            [
                is_categorical_actually_boolean(X, col)
                for col in X.ww.select("Categorical")
            ],
        ):
            raise ValueError(
                "SimpleImputer cannot handle dataframes with both boolean and categorical features. Use Imputer instead.",
            )

        nan_ratio = X.isna().sum() / X.shape[0]
        self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()

        X, _ = drop_natural_language_columns(X)

        # Convert any boolean columns to IntegerNullable, but keep track of the columns so they can be converted back
        self._boolean_cols = list(
            X.ww.select(
                include=["Boolean", "BooleanNullable"],
                return_schema=True,
            ).columns,
        )
        # Make sure we're tracking Categorical columns that should be boolean as well
        self._boolean_cols.extend(
            [
                col
                for col in X.ww.select("Categorical")
                if is_categorical_actually_boolean(X, col)
            ],
        )
        X = set_boolean_columns_to_integer(X)

        # If the Dataframe only had natural language columns, do nothing.
        if X.shape[1] == 0:
            return self

        self._component_obj.fit(X, y)
        return self
    def transform(self, X, y=None):
        """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same.

        Args:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Ignored.

        Returns:
            pd.DataFrame: Transformed X
        """
        X = infer_feature_types(X)
        original_schema = X.ww.schema
        X = set_boolean_columns_to_integer(X)

        not_all_null_cols = [
            col for col in X.columns if col not in self._all_null_cols
        ]
        original_index = X.index

        # Drop natural language columns and transform the other columns
        X_t, natural_language_cols = drop_natural_language_columns(X)
        if X_t.shape[1] == 0:
            return X

        not_all_null_or_natural_language_cols = [
            col for col in not_all_null_cols if col not in natural_language_cols
        ]

        X_t = self._component_obj.transform(X_t)
        X_t = pd.DataFrame(X_t, columns=not_all_null_or_natural_language_cols)
        new_schema = original_schema.get_subset_schema(X_t.columns)

        # Iterate through previously saved boolean columns and convert them back to boolean
        for col in self._boolean_cols:
            X_t[col] = X_t[col].astype(bool)

        # Convert Nullable Integers to Doubles for the "mean" and "median" strategies
        if self.impute_strategy in ["mean", "median"]:
            nullable_int_cols = X.ww.select(["IntegerNullable"], return_schema=True)
            nullable_int_cols = [x for x in nullable_int_cols.columns.keys()]
            for col in nullable_int_cols:
                new_schema.set_types({col: Double})

        X_t.ww.init(schema=new_schema)

        # Add back in natural language columns, unchanged
        if len(natural_language_cols) > 0:
            X_t = woodwork.concat_columns([X_t, X[natural_language_cols]])

        if not_all_null_or_natural_language_cols:
            X_t.index = original_index
        return X_t
    def fit_transform(self, X, y=None):
        """Fits on X and transforms X.

        Args:
            X (pd.DataFrame): Data to fit and transform.
            y (pd.Series, optional): Target data.

        Returns:
            pd.DataFrame: Transformed X
        """
        return self.fit(X, y).transform(X, y)
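A minimal usage sketch of this component follows. The DataFrame, column names, and chosen strategy below are illustrative assumptions, not part of the module above; with impute_strategy="mean", each missing entry is filled with its column's mean.

    import numpy as np
    import pandas as pd

    from evalml.pipelines.components.transformers.imputers.simple_imputer import (
        SimpleImputer,
    )

    # Illustrative data: two numeric columns, each with one missing value.
    X = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [2.0, 4.0, np.nan]})

    # Fit on X and replace missing values with each column's mean.
    imputer = SimpleImputer(impute_strategy="mean")
    X_imputed = imputer.fit_transform(X)
    print(X_imputed)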