Source code for evalml.pipelines.components.transformers.imputers.imputer
"""Component that imputes missing data according to a specified imputation strategy."""importpandasaspdfromevalml.pipelines.components.transformersimportTransformerfromevalml.pipelines.components.transformers.imputersimportKNNImputer,SimpleImputerfromevalml.utilsimportinfer_feature_types
class Imputer(Transformer):
    """Imputes missing data according to a specified imputation strategy.

    Numeric, categorical and boolean columns are each handled by their own
    underlying imputer. Columns that are entirely null when ``fit`` is called
    are dropped by ``transform``.

    Args:
        categorical_impute_strategy (string): Impute strategy to use for categorical columns.
            Valid values include "most_frequent" and "constant".
        numeric_impute_strategy (string): Impute strategy to use for numeric columns.
            Valid values include "mean", "median", "most_frequent", "constant", and "knn".
        boolean_impute_strategy (string): Impute strategy to use for boolean columns.
            Valid values include "most_frequent" and "constant".
        categorical_fill_value (string): When categorical_impute_strategy == "constant", fill_value is used
            to replace missing data. The default value of None will fill with the string "missing_value".
        numeric_fill_value (int, float): When numeric_impute_strategy == "constant", fill_value is used
            to replace missing data. The default value of None will fill with 0.
        boolean_fill_value (bool): When boolean_impute_strategy == "constant", fill_value is used
            to replace missing data. The default value of None will fill with True.
        random_seed (int): Seed for the random number generator. Defaults to 0.
        **kwargs: Extra keyword arguments; recorded in the component parameters and forwarded
            to each underlying imputer.

    Raises:
        ValueError: If any of the three impute strategies is not one of its valid values.
    """

    name = "Imputer"
    hyperparameter_ranges = {
        "categorical_impute_strategy": ["most_frequent"],
        "numeric_impute_strategy": ["mean", "median", "most_frequent", "knn"],
        "boolean_impute_strategy": ["most_frequent"],
    }
    """{
        "categorical_impute_strategy": ["most_frequent"],
        "numeric_impute_strategy": ["mean", "median", "most_frequent", "knn"],
        "boolean_impute_strategy": ["most_frequent"]
    }"""
    _valid_categorical_impute_strategies = {"most_frequent", "constant"}
    _valid_numeric_impute_strategies = {
        "mean",
        "median",
        "most_frequent",
        "constant",
        "knn",
    }
    _valid_boolean_impute_strategies = {"most_frequent", "constant"}

    def __init__(
        self,
        categorical_impute_strategy="most_frequent",
        categorical_fill_value=None,
        numeric_impute_strategy="mean",
        numeric_fill_value=None,
        boolean_impute_strategy="most_frequent",
        boolean_fill_value=None,
        random_seed=0,
        **kwargs,
    ):
        # Validate in a fixed order (categorical, numeric, boolean). Each message lists
        # the valid values for *that* kind of column, sorted so the text is stable
        # across runs (set iteration order is not).
        strategy_checks = (
            (
                "categorical",
                categorical_impute_strategy,
                self._valid_categorical_impute_strategies,
            ),
            ("numeric", numeric_impute_strategy, self._valid_numeric_impute_strategies),
            ("boolean", boolean_impute_strategy, self._valid_boolean_impute_strategies),
        )
        for kind, strategy, valid_strategies in strategy_checks:
            if strategy not in valid_strategies:
                raise ValueError(
                    f"{strategy} is an invalid parameter. "
                    f"Valid {kind} imputation strategies are "
                    f"{', '.join(sorted(valid_strategies))}",
                )

        parameters = {
            "categorical_impute_strategy": categorical_impute_strategy,
            "numeric_impute_strategy": numeric_impute_strategy,
            "boolean_impute_strategy": boolean_impute_strategy,
            "categorical_fill_value": categorical_fill_value,
            "numeric_fill_value": numeric_fill_value,
            "boolean_fill_value": boolean_fill_value,
        }
        parameters.update(kwargs)

        self._categorical_imputer = SimpleImputer(
            impute_strategy=categorical_impute_strategy,
            fill_value=categorical_fill_value,
            **kwargs,
        )
        self._boolean_imputer = SimpleImputer(
            impute_strategy=boolean_impute_strategy,
            fill_value=boolean_fill_value,
            **kwargs,
        )
        # "knn" is the only numeric strategy that SimpleImputer does not handle.
        if numeric_impute_strategy == "knn":
            self._numeric_imputer = KNNImputer(
                number_neighbors=3,
                **kwargs,
            )
        else:
            self._numeric_imputer = SimpleImputer(
                impute_strategy=numeric_impute_strategy,
                fill_value=numeric_fill_value,
                **kwargs,
            )

        # Populated by fit(); None means the component has not been fitted yet.
        self._all_null_cols = None
        self._numeric_cols = None
        self._categorical_cols = None
        self._boolean_cols = None
        super().__init__(
            parameters=parameters,
            component_obj=None,
            random_seed=random_seed,
        )

    def fit(self, X, y=None):
        """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same.

        Args:
            X (pd.DataFrame, np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series, optional): The target training data of length [n_samples]

        Returns:
            self
        """
        X = infer_feature_types(X)
        cat_cols = list(X.ww.select(["category"], return_schema=True).columns)
        bool_cols = list(
            X.ww.select(["BooleanNullable", "Boolean"], return_schema=True).columns,
        )
        numeric_cols = list(X.ww.select(["numeric"], return_schema=True).columns)

        # Columns with no observed value at all cannot be imputed; remember them so
        # transform() can drop them.
        nan_ratio = X.isna().sum() / X.shape[0]
        self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()
        all_null = set(self._all_null_cols)

        # The column groups are recorded even when empty so that a refit never leaves
        # columns from a previous fit behind; transform() skips empty groups.
        X_numerics = X.ww[[col for col in numeric_cols if col not in all_null]]
        if len(X_numerics.columns) > 0:
            self._numeric_imputer.fit(X_numerics, y)
        self._numeric_cols = X_numerics.columns

        X_categorical = X.ww[[col for col in cat_cols if col not in all_null]]
        if len(X_categorical.columns) > 0:
            self._categorical_imputer.fit(X_categorical, y)
        self._categorical_cols = X_categorical.columns

        X_boolean = X.ww[[col for col in bool_cols if col not in all_null]]
        if len(X_boolean.columns) > 0:
            self._boolean_imputer.fit(X_boolean, y)
        self._boolean_cols = X_boolean.columns
        return self

    def transform(self, X, y=None):
        """Transforms data X by imputing missing values.

        Args:
            X (pd.DataFrame): Data to transform
            y (pd.Series, optional): Ignored.

        Returns:
            pd.DataFrame: Transformed X
        """
        X = infer_feature_types(X)
        # Every column was all-null at fit time: nothing is left, return an empty
        # frame that keeps the original index.
        if len(self._all_null_cols) == X.shape[1]:
            df = pd.DataFrame(index=X.index)
            df.ww.init()
            return df

        X_no_all_null = X.ww.drop(self._all_null_cols)
        original_schema = X_no_all_null.ww.schema
        new_ltypes = None

        if self._numeric_cols is not None and len(self._numeric_cols) > 0:
            X_numeric = X.ww[self._numeric_cols.tolist()]
            imputed = self._numeric_imputer.transform(X_numeric)
            X_no_all_null[X_numeric.columns] = imputed
            # Numeric imputing may have changed logical types because of use of _get_new_logical_types_for_imputed_data
            if imputed.ww.schema is None:
                imputed.ww.init()
            new_ltypes = imputed.ww.logical_types

        if self._categorical_cols is not None and len(self._categorical_cols) > 0:
            X_categorical = X.ww[self._categorical_cols.tolist()]
            imputed = self._categorical_imputer.transform(X_categorical)
            X_no_all_null[X_categorical.columns] = imputed

        if self._boolean_cols is not None and len(self._boolean_cols) > 0:
            X_boolean = X.ww[self._boolean_cols.tolist()]
            imputed = self._boolean_imputer.transform(X_boolean)
            X_no_all_null[X_boolean.columns] = imputed

        # Plain column assignment above bypasses Woodwork, so re-initialise typing from
        # the pre-imputation schema, overriding with any logical types the numeric
        # imputer changed.
        X_no_all_null.ww.init(schema=original_schema, logical_types=new_ltypes)
        return X_no_all_null