Source code for evalml.pipelines.components.transformers.imputers.per_column_imputer
"""Component that imputes missing data according to a specified imputation strategy per column."""importwarningsfromevalml.pipelines.components.transformersimportTransformerfromevalml.pipelines.components.transformers.imputers.simple_imputerimport(SimpleImputer,)fromevalml.utilsimportinfer_feature_types
class PerColumnImputer(Transformer):
    """Imputes missing data according to a specified imputation strategy per column.

    Args:
        impute_strategies (dict): Column and {"impute_strategy": strategy, "fill_value": value} pairings.
            Valid values for impute strategy include "mean", "median", "most_frequent", "constant" for
            numerical data, and "most_frequent", "constant" for object data types.
            Defaults to None, in which case no columns are imputed and `fit` emits a warning.
            When impute_strategy == "constant", fill_value is used to replace missing data.
            When fill_value is None, uses 0 when imputing numerical data and "missing_value" for
            strings or object data types.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Raises:
        ValueError: If `impute_strategies` is neither None nor a dictionary.
    """

    name = "Per Column Imputer"
    hyperparameter_ranges = {}
    """{}"""

    def __init__(
        self,
        impute_strategies=None,
        random_seed=0,
        **kwargs,
    ):
        # Validate the raw argument *before* substituting a default, so that
        # falsy non-dict values (e.g. [], "", 0) are rejected instead of being
        # silently coerced to an empty mapping.
        if impute_strategies is not None and not isinstance(impute_strategies, dict):
            raise ValueError(
                "`impute_strategies` is not a dictionary. Please provide in Column and {`impute_strategy`: strategy, `fill_value`:value} pairs. ",
            )
        # The user-supplied value (possibly None) is what gets recorded as the
        # component parameter; **kwargs is accepted but not used here.
        parameters = {
            "impute_strategies": impute_strategies,
        }
        # Per-column imputers are created in `fit`.
        self.imputers = None
        self.impute_strategies = {} if impute_strategies is None else impute_strategies
        super().__init__(
            parameters=parameters,
            component_obj=None,
            random_seed=random_seed,
        )

    def fit(self, X, y=None):
        """Fits imputers on input data.

        One SimpleImputer is built for every column named in
        `impute_strategies` and fitted on that single column.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to fit.
            y (pd.Series, optional): The target training data of length [n_samples]. Ignored.

        Returns:
            self

        Raises:
            KeyError: If a column's strategy dictionary has no "impute_strategy" entry.
        """
        X = infer_feature_types(X)
        self.imputers = {}
        if not self.impute_strategies:
            warnings.warn(
                "No columns to impute. Please check `impute_strategies` parameter.",
            )
        # Build every imputer first so that a malformed strategy entry fails
        # before any fitting work is done.
        for column, strategy_dict in self.impute_strategies.items():
            self.imputers[column] = SimpleImputer(
                impute_strategy=strategy_dict["impute_strategy"],
                fill_value=strategy_dict.get("fill_value"),
            )
        for column, imputer in self.imputers.items():
            # Double brackets: each imputer is fitted on a one-column frame.
            imputer.fit(X.ww[[column]])
        return self

    def transform(self, X, y=None):
        """Transforms input data by imputing missing values.

        Columns whose imputer returns an empty result are dropped from the
        output; all other configured columns are replaced by their imputed
        values. Columns without a configured strategy are left untouched.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features] to transform.
            y (pd.Series, optional): The target training data of length [n_samples]. Ignored.

        Returns:
            pd.DataFrame: Transformed X
        """
        X_ww = infer_feature_types(X)
        # Remember the incoming schema so the typing of surviving columns can
        # be re-applied after the drop below.
        original_schema = X_ww.ww.schema
        cols_to_drop = []
        for column, imputer in self.imputers.items():
            transformed = imputer.transform(X_ww.ww[[column]])
            if transformed.empty:
                # NOTE(review): presumably SimpleImputer removed the column
                # (e.g. it was entirely missing) - confirm against SimpleImputer.
                cols_to_drop.append(column)
            else:
                X_ww.ww[column] = transformed[column]
        X_t = X_ww.ww.drop(cols_to_drop)
        X_t.ww.init(schema=original_schema.get_subset_schema(X_t.columns))
        return X_t