Source code for evalml.pipelines.components.transformers.encoders.onehot_encoder
"""A transformer that encodes categorical features in a one-hot numeric array."""importnumpyasnpimportpandasaspdimportwoodworkaswwfromsklearn.preprocessingimportOneHotEncoderasSKOneHotEncoderfromevalml.pipelines.componentsimportComponentBaseMetafromevalml.pipelines.components.transformers.transformerimportTransformerfromevalml.utilsimportinfer_feature_types
class OneHotEncoderMeta(ComponentBaseMeta):
    """Metaclass for the one-hot encoder component.

    Extends the ``METHODS_TO_CHECK`` list inherited from ``ComponentBaseMeta``
    with the one-hot-encoder-specific methods ``categories`` and
    ``get_feature_names``.
    """

    METHODS_TO_CHECK = [
        *ComponentBaseMeta.METHODS_TO_CHECK,
        "categories",
        "get_feature_names",
    ]
class OneHotEncoder(Transformer, metaclass=OneHotEncoderMeta):
    """A transformer that encodes categorical features in a one-hot numeric array.

    Args:
        top_n (int): Number of categories per column to encode. If None, all categories will be encoded.
            Otherwise, the `n` most frequent will be encoded and all others will be dropped. Defaults to 10.
        features_to_encode (list[str]): List of columns to encode. All other columns will remain untouched.
            If None, all appropriate columns will be encoded. Defaults to None.
        categories (list): A two dimensional list of categories, where `categories[i]` is a list of the categories
            for the column at index `i`. This can also be `None`, or `"auto"` if `top_n` is not None. Defaults to None.
        drop (string, list): Method ("first" or "if_binary") to use to drop one category per feature. Can also be
            a list specifying which categories to drop for each feature. Defaults to 'if_binary'.
        handle_unknown (string): Whether to ignore or error for unknown categories for a feature encountered
            during `fit` or `transform`. If either `top_n` or `categories` is used to limit the number of categories
            per column, this must be "ignore". Defaults to "ignore".
        handle_missing (string): Options for how to handle missing (NaN) values encountered during
            `fit` or `transform`. If this is set to "as_category" and NaN values are within the `n` most frequent,
            "nan" values will be encoded as their own column. If this is set to "error", any missing
            values encountered will raise an error. Defaults to "error".
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "One Hot Encoder"
    hyperparameter_ranges = {}
    """{}"""

    def __init__(
        self,
        top_n=10,
        features_to_encode=None,
        categories=None,
        drop="if_binary",
        handle_unknown="ignore",
        handle_missing="error",
        random_seed=0,
        **kwargs,
    ):
        parameters = {
            "top_n": top_n,
            "features_to_encode": features_to_encode,
            "categories": categories,
            "drop": drop,
            "handle_unknown": handle_unknown,
            "handle_missing": handle_missing,
        }
        parameters.update(kwargs)

        # Check correct inputs
        unknown_input_options = ["ignore", "error"]
        missing_input_options = ["as_category", "error"]
        if handle_unknown not in unknown_input_options:
            raise ValueError(f"Invalid input {handle_unknown} for handle_unknown")
        if handle_missing not in missing_input_options:
            raise ValueError(f"Invalid input {handle_missing} for handle_missing")
        if top_n is not None and categories is not None:
            raise ValueError("Cannot use categories and top_n arguments simultaneously")

        self.features_to_encode = features_to_encode
        self._encoder = None

        super().__init__(
            parameters=parameters,
            component_obj=None,
            random_seed=random_seed,
        )
        # Seed used when shuffling value counts to break ties between
        # equally frequent categories in `fit`.
        self._initial_state = self.random_seed
        self._provenance = {}

    @staticmethod
    def _get_cat_cols(X):
        """Get names of categorical columns in the input DataFrame."""
        return list(X.ww.select(include=["category"], return_schema=True).columns)

    def fit(self, X, y=None):
        """Fits the one-hot encoder component.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            ValueError: If encoding a column failed.
        """
        top_n = self.parameters["top_n"]
        X = infer_feature_types(X)
        if self.features_to_encode is None:
            self.features_to_encode = self._get_cat_cols(X)

        known_columns = list(X.columns)
        invalid_features = [
            col for col in self.features_to_encode if col not in known_columns
        ]
        if len(invalid_features) > 0:
            # Column names are not guaranteed to be strings; stringify them so
            # the intended ValueError is raised rather than a TypeError from join.
            raise ValueError(
                "Could not find and encode {} in input data.".format(
                    ", ".join(str(col) for col in invalid_features),
                ),
            )

        X_t = self._handle_parameter_handle_missing(X)
        self._binary_values_to_drop = []

        if len(self.features_to_encode) == 0:
            categories = "auto"
        elif self.parameters["categories"] is not None:
            categories = self.parameters["categories"]
            if len(categories) != len(self.features_to_encode) or not isinstance(
                categories[0],
                list,
            ):
                raise ValueError(
                    "Categories argument must contain a list of categories for each categorical feature",
                )
        else:
            categories = []
            for col in X_t[self.features_to_encode]:
                value_counts = X_t[col].value_counts(dropna=False).to_frame()
                if self.parameters["drop"] == "if_binary" and len(value_counts) == 2:
                    # value_counts is ordered by frequency, so the first
                    # index entry is the majority class.
                    majority_class_value = value_counts.index.tolist()[0]
                    self._binary_values_to_drop.append((col, majority_class_value))
                if top_n is None or len(value_counts) <= top_n:
                    unique_values = value_counts.index.tolist()
                else:
                    # Shuffle with a fixed seed, then stable-sort by count, so
                    # ties between equally frequent categories are broken
                    # reproducibly rather than by original order.
                    value_counts = value_counts.sample(
                        frac=1,
                        random_state=self._initial_state,
                    )
                    value_counts = value_counts.sort_values(
                        value_counts.iloc[:, 0].name,
                        ascending=False,
                        kind="mergesort",
                    )
                    unique_values = value_counts.head(top_n).index.tolist()
                unique_values = np.sort(unique_values)
                categories.append(unique_values)

        # Create an encoder to pass off the rest of the computation to.
        # If "drop" is set to "if_binary", pass None to scikit-learn because
        # that case is handled manually via `_binary_values_to_drop`.
        drop_to_use = (
            None if self.parameters["drop"] == "if_binary" else self.parameters["drop"]
        )
        self._encoder = SKOneHotEncoder(
            categories=categories,
            drop=drop_to_use,
            handle_unknown=self.parameters["handle_unknown"],
        )
        self._encoder.fit(X_t[self.features_to_encode])
        return self

    def transform(self, X, y=None):
        """One-hot encode the input data.

        Args:
            X (pd.DataFrame): Features to one-hot encode.
            y (pd.Series): Ignored.

        Returns:
            pd.DataFrame: Transformed data, where each categorical feature has been encoded into
                numerical columns using one-hot encoding.
        """
        X = infer_feature_types(X)
        X_copy = self._handle_parameter_handle_missing(X)
        X = X.ww.drop(columns=self.features_to_encode)

        # Call sklearn's transform on the categorical columns
        if len(self.features_to_encode) > 0:
            X_cat = pd.DataFrame(
                self._encoder.transform(X_copy[self.features_to_encode]).toarray(),
                index=X_copy.index,
            )
            # _get_feature_names also refreshes self._features_to_drop, which
            # is consumed on the next line.
            X_cat.columns = self._get_feature_names()
            X_cat = X_cat.drop(columns=self._features_to_drop).astype("bool")
            X_cat.ww.init(logical_types={c: "Boolean" for c in X_cat.columns})
            self._feature_names = X_cat.columns

            X = ww.utils.concat_columns([X, X_cat])

        return X

    def _handle_parameter_handle_missing(self, X):
        """Helper method to handle the `handle_missing` parameter.

        Args:
            X (pd.DataFrame): Data containing every column in `features_to_encode`.

        Returns:
            pd.DataFrame: `X` itself when nothing needs to change, otherwise a copy of
                `X` in which missing values of the encoded columns are replaced by the
                string "nan". The input frame is never modified.

        Raises:
            ValueError: If `handle_missing` is "error" and an encoded column contains NaN.
        """
        cat_cols = self.features_to_encode
        handle_missing = self.parameters["handle_missing"]
        if handle_missing == "error" and X[cat_cols].isnull().any().any():
            raise ValueError("Input contains NaN")
        if handle_missing != "as_category" or len(cat_cols) == 0:
            return X

        # Work on a copy so the caller's DataFrame is left untouched.
        X = X.copy()
        for col in cat_cols:
            if X[col].dtype == "category" and pd.isna(X[col]).any():
                # "nan" must be a registered category before it can be assigned.
                X[col] = X[col].cat.add_categories("nan")
                X[col] = X[col].where(~pd.isna(X[col]), other="nan")
            X[col] = X[col].replace(np.nan, "nan")
        return X

    def categories(self, feature_name):
        """Returns a list of the unique categories to be encoded for the particular feature, in order.

        Args:
            feature_name (str): The name of any feature provided to one-hot encoder during fit.

        Returns:
            np.ndarray: The unique categories, in the same dtype as they were provided during fit.

        Raises:
            ValueError: If feature was not provided to one-hot encoder as a training feature.
        """
        try:
            index = self.features_to_encode.index(feature_name)
        except Exception:
            # Deliberately broad: besides a failed lookup, `features_to_encode`
            # may still be None (no `index` attribute) before `fit` is called.
            raise ValueError(
                f'Feature "{feature_name}" was not provided to one-hot encoder as a training feature',
            )
        return self._encoder.categories_[index]

    @staticmethod
    def _make_name_unique(name, seen_before):
        """Helper to make the name unique.

        Args:
            name (str): Proposed feature name.
            seen_before (set): Names that are already taken.

        Returns:
            str: `name` if it is unused, otherwise `name` suffixed with the smallest
                positive integer `i` such that `{name}_{i}` is unused.
        """
        if name not in seen_before:
            return name

        # Only modify the name if it has been seen before
        i = 1
        candidate = f"{name}_{i}"
        while candidate in seen_before:
            i += 1
            candidate = f"{name}_{i}"
        return candidate

    def _get_feature_names(self):
        """Return feature names for the categorical features after fitting, before the majority class for binary encoded features are dropped.

        Feature names are formatted as {column name}_{category name}. In the event of a duplicate name,
        an integer will be added at the end of the feature name to distinguish it.

        For example, consider a dataframe with a column called "A" and category "x_y" and another column
        called "A_x" with "y". In this example, the feature names would be "A_x_y" and "A_x_y_1".

        As a side effect this resets `self._features_to_drop` and `self._provenance`.

        Returns:
            list[str]: The feature names after encoding, provided in the same order as input_features.
        """
        self._features_to_drop = []
        unique_names = []
        seen_before = set()
        provenance = {}
        drop_idx = self._encoder.drop_idx_
        for col_index, col in enumerate(self.features_to_encode):
            unique_encoded_columns = []
            encoded_features_to_drop = []
            for cat_index, category in enumerate(self.categories(col)):
                # Drop categories specified by the user
                if (
                    drop_idx is not None
                    and drop_idx[col_index] is not None
                    and cat_index == drop_idx[col_index]
                ):
                    continue

                # Follow sklearn naming convention but if name has been seen
                # before then add an int to make it unique
                proposed_name = self._make_name_unique(
                    f"{col}_{category}",
                    seen_before,
                )
                if (col, category) in self._binary_values_to_drop:
                    encoded_features_to_drop.append(proposed_name)

                unique_names.append(proposed_name)
                unique_encoded_columns.append(proposed_name)
                seen_before.add(proposed_name)

            self._features_to_drop.extend(encoded_features_to_drop)
            # Names are unique, so filtering is equivalent to removing each
            # dropped name once.
            dropped = set(encoded_features_to_drop)
            provenance[col] = [
                encoded for encoded in unique_encoded_columns if encoded not in dropped
            ]
        self._provenance = provenance
        return unique_names

    def get_feature_names(self):
        """Return feature names for the categorical features after fitting.

        Feature names are formatted as {column name}_{category name}. In the event of a duplicate name,
        an integer will be added at the end of the feature name to distinguish it.

        For example, consider a dataframe with a column called "A" and category "x_y" and another column
        called "A_x" with "y". In this example, the feature names would be "A_x_y" and "A_x_y_1".

        Returns:
            list[str]: The feature names after encoding, provided in the same order as input_features.
        """
        feature_names = self._get_feature_names()
        dropped = set(self._features_to_drop)
        return [
            feature_name for feature_name in feature_names if feature_name not in dropped
        ]