Source code for evalml.pipelines.components.transformers.samplers.base_sampler

"""Base Sampler component. Used as the base class of all sampler components."""
import copy
from abc import abstractmethod

from woodwork.logical_types import IntegerNullable

from evalml.pipelines.components.transformers import Transformer
from evalml.utils.woodwork_utils import infer_feature_types


[docs]class BaseSampler(Transformer): """Base Sampler component. Used as the base class of all sampler components. Args: parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. """ modifies_features = True modifies_target = True training_only = True
[docs] def fit(self, X, y): """Fits the sampler to the data. Args: X (pd.DataFrame): Input features. y (pd.Series): Target. Returns: self Raises: ValueError: If y is None. """ if y is None: raise ValueError("y cannot be None") X_ww, y_ww = self._prepare_data(X, y) self._initialize_sampler(X_ww, y_ww) return self
@abstractmethod def _initialize_sampler(self, X, y): """Helper function to initialize the sampler component object. Args: X (pd.DataFrame): Features. y (pd.Series): The target data. """ def _prepare_data(self, X, y): """Transforms the input data to pandas data structure that our sampler can ingest. Args: X (pd.DataFrame): Training features. y (pd.Series): Target. Returns: pd.DataFrame, pd.Series: Prepared X and y data as pandas types """ X = infer_feature_types(X) int_nullable_cols = X.ww.select(IntegerNullable).columns if len(int_nullable_cols) > 0: try: X = X.astype( { null_col: int for null_col in X.ww.select(IntegerNullable).columns }, ) except ValueError: X = X.astype( { null_col: float for null_col in X.ww.select(IntegerNullable).columns }, ) X.ww.init(schema=X.ww.schema) if y is None: raise ValueError("y cannot be None") y = infer_feature_types(y) return X, y
[docs] def transform(self, X, y=None): """Transforms the input data by sampling the data. Args: X (pd.DataFrame): Training features. y (pd.Series): Target. Returns: pd.DataFrame, pd.Series: Transformed features and target. """ X, y = self._prepare_data(X, y) categorical_columns = X.ww.select("Categorical", return_schema=True).columns for col in categorical_columns: X[col] = X[col].astype("object") X_new, y_new = self._component_obj.fit_resample(X, y) for col in categorical_columns: X[col] = X[col].astype("category") X_new.ww.init(schema=X.ww.schema) y_new.ww.init(schema=y.ww.schema) return X_new, y_new
def _convert_dictionary(self, sampling_dict, y): """Converts the provided sampling dictionary from a dictionary of ratios to a dictionary of number of samples. Expects the provided dictionary keys to be the target values y, and the associated values to be the min:max ratios. Converts and returns a dictionary with the same keys, but changes the values to be the number of samples rather than ratio. Args: sampling_dict (dict): The input sampling dictionary passed in from user. y (pd.Series): The target values. Returns: dict: A dictionary with target values as keys and the number of samples as values. """ # check that the lengths of the dict and y are equal y_unique = y.unique() if len(sampling_dict) != len(y_unique): raise ValueError( "Sampling dictionary contains a different number of targets than are provided in the data.", ) if len(set(sampling_dict.keys()).intersection(set(y_unique))) != len(y_unique): raise ValueError("Dictionary keys are different from target values!") new_dic = {} y_counts = y.value_counts() for k, v in sampling_dict.items(): # turn the ratios into sampler values if self.__class__.__name__ == "Undersampler": # for undersampling, we make sure we never sample more than the # total samples for that class new_dic[k] = int(min(y_counts.values[-1] / v, y_counts[k])) else: # for oversampling, we need to make sure we never sample less than # the total samples for that class new_dic[k] = int(max(y_counts.values[0] * v, y_counts[k])) return new_dic def _dictionary_to_params(self, sampling_dict, y): """If a sampling ratio dictionary is provided, add the updated sampling dictionary to the parameters and return the updated parameter dictionary. Otherwise, simply return the current parameters. Args: sampling_dict (dict): The input sampling dictionary passed in from user. y (pd.Series): The target values. Returns: dict: The parameters dictionary with the sampling_ratio_dict value replaced as necessary. """ param_copy = copy.copy(self.parameters) if self.parameters["sampling_ratio_dict"]: new_dic = self._convert_dictionary( self.parameters["sampling_ratio_dict"], y, ) param_copy["sampling_ratio_dict"] = new_dic return param_copy
[docs] def fit_transform(self, X, y): """Fit and transform data using the sampler component. Args: X (pd.DataFrame): The input training data of shape [n_samples, n_features]. y (pd.Series, optional): The target training data of length [n_samples]. Returns: (pd.DataFrame, pd.Series): Transformed data. """ return self.fit(X, y).transform(X, y)