# Source code for evalml.pipelines.components.transformers.encoders.onehot_encoder


import numpy as np
import pandas as pd

from .encoder import CategoricalEncoder


class OneHotEncoder(CategoricalEncoder):
    """One-hot encoder to encode non-numeric data."""

    name = 'One Hot Encoder'
    hyperparameter_ranges = {}

    def __init__(self, top_n=10, random_state=0):
        """Initializes the encoder.

        Arguments:
            top_n (int): Maximum number of categories to encode per column.
                Columns with more distinct values keep only the `top_n`
                most frequent ones. Defaults to 10.
            random_state: Forwarded to the parent constructor. `fit` reads
                `self.random_state` to seed the shuffle that breaks ties
                between equally frequent categories.
        """
        parameters = {"top_n": top_n}
        super().__init__(parameters=parameters,
                         component_obj=None,
                         random_state=random_state)

    def _get_cat_cols(self, X):
        """Get names of 'object' or 'categorical' columns in the DataFrame."""
        # `np.object` was only an alias of the builtin `object` and has been
        # removed from NumPy; comparing against `object` is equivalent.
        # The isinstance check replaces the deprecated
        # `pd.api.types.is_categorical_dtype`.
        return [
            col for col, dtype in zip(X.columns.values, X.dtypes)
            if dtype == object
            or isinstance(dtype, pd.api.types.CategoricalDtype)
        ]

    def fit(self, X, y=None):
        """Learn which category values of each categorical column to encode.

        For every object/categorical column the distinct values (missing
        values included) are counted. If a column has more than `top_n`
        distinct values, only the `top_n` most frequent are kept; ties are
        broken by a shuffle seeded with `self.random_state`.

        Arguments:
            X (pd.DataFrame or array-like): Features to fit on. Anything that
                is not a DataFrame is wrapped in one.
            y (pd.Series): Ignored.

        Returns:
            self
        """
        top_n = self.parameters['top_n']
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        cols_to_encode = self._get_cat_cols(X)

        self.col_unique_values = {}
        for col in X.columns:
            if col not in cols_to_encode:
                continue
            # Keep the counts as a Series: sorting a `to_frame()` result by
            # the column's name breaks on pandas >= 2.0, where the counts
            # column is called "count".
            value_counts = X[col].value_counts(dropna=False)
            if len(value_counts) > top_n:
                # Shuffle first, then stable-sort by count, so categories
                # with equal counts are picked in a seeded random order
                # instead of always favouring the same ones.
                value_counts = value_counts.sample(
                    frac=1, random_state=self.random_state)
                value_counts = value_counts.sort_values(
                    ascending=False, kind='mergesort')
                value_counts = value_counts.head(top_n)
            self.col_unique_values[col] = value_counts.index.tolist()
        return self

    def transform(self, X, y=None):
        """One-hot encode the input DataFrame.

        Arguments:
            X (pd.DataFrame): Dataframe of features.
            y (pd.Series): Ignored.

        Returns:
            Transformed dataframe, where each categorical feature has been
            encoded into numerical columns using one-hot encoding. Columns
            that were not selected during `fit` are passed through unchanged.

        Raises:
            RuntimeError: If `transform` is called before `fit`.
        """
        try:
            col_values = self.col_unique_values
        except AttributeError:
            raise RuntimeError("You must fit one hot encoder before calling transform!")

        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        # Collect all output columns and concatenate once; concatenating
        # inside the loop re-copies the growing frame for every new column.
        pieces = []
        for col in X.columns:
            if col not in col_values:
                pieces.append(X[col])
                continue
            column = X[col]
            for label in col_values[col]:
                # NaN never compares equal to itself, so a missing-value
                # category must be matched with isna() rather than ==.
                if pd.api.types.is_scalar(label) and pd.isna(label):
                    indicator = column.isna()
                else:
                    indicator = column == label
                new_name = str(col) + "_" + str(label)
                pieces.append(indicator.astype(int).rename(new_name))

        if not pieces:
            # No input columns: same empty result as before.
            return pd.DataFrame()
        return pd.concat(pieces, axis=1)