Source code for evalml.pipelines.components.transformers.samplers.undersampler
"""An undersampling transformer to downsample the majority classes in the dataset."""
import numpy as np
import pandas as pd

from evalml.pipelines.components.transformers.samplers.base_sampler import (
    BaseSampler,
)
from evalml.utils.woodwork_utils import infer_feature_types
class Undersampler(BaseSampler):
    """Initializes an undersampling transformer to downsample the majority classes in the dataset.

    This component is only run during training and not during predict.

    Args:
        sampling_ratio (float): The smallest minority:majority ratio that is accepted as 'balanced'. For instance, a 1:4 ratio would be
            represented as 0.25, while a 1:1 ratio is 1.0. Must be in the range (0, 1]. Defaults to 0.25.
        sampling_ratio_dict (dict): A dictionary specifying the desired balanced ratio for each target value. For instance, in a binary case where class 1 is the minority, we could specify:
            `sampling_ratio_dict={0: 0.5, 1: 1}`, which means we would undersample class 0 to have twice the number of samples as class 1 (minority:majority ratio = 0.5), and don't sample class 1.
            Overrides sampling_ratio if provided. Defaults to None.
        min_samples (int): The minimum number of samples that we must have for any class, pre or post sampling. If a class must be downsampled, it will not be downsampled past this value.
            To determine severe imbalance, the minority class must occur less often than this and must have a class ratio below min_percentage.
            Must be greater than 0. Defaults to 100.
        min_percentage (float): The minimum percentage of the minimum class to total dataset that we tolerate, as long as it is above min_samples.
            If min_percentage and min_samples are not met, treat this as severely imbalanced, and we will not resample the data.
            Must be in the range (0, 0.5]. Defaults to 0.1.
        random_seed (int): The seed to use for random sampling. Defaults to 0.

    Raises:
        ValueError: If sampling_ratio is not in the range (0, 1].
        ValueError: If min_samples is not greater than 0.
        ValueError: If min_percentage is not in the range (0, 0.5].
    """

    name = "Undersampler"
    hyperparameter_ranges = {}
    """{}"""

    def __init__(
        self,
        sampling_ratio=0.25,
        sampling_ratio_dict=None,
        min_samples=100,
        min_percentage=0.1,
        random_seed=0,
        **kwargs,
    ):
        if sampling_ratio <= 0 or sampling_ratio > 1:
            raise ValueError(
                f"sampling_ratio must be within (0, 1], but received {sampling_ratio}",
            )
        if min_samples <= 0:
            raise ValueError(
                f"min_sample must be greater than 0, but received {min_samples}",
            )
        # NOTE: 0 itself is rejected; the message text is kept unchanged for
        # callers that match on it.
        if min_percentage <= 0 or min_percentage > 0.5:
            raise ValueError(
                f"min_percentage must be between 0 and 0.5, inclusive, but received {min_percentage}",
            )
        parameters = {
            "sampling_ratio": sampling_ratio,
            "min_samples": min_samples,
            "min_percentage": min_percentage,
            "sampling_ratio_dict": sampling_ratio_dict,
        }
        self.sampling_ratio = sampling_ratio
        self.min_samples = min_samples
        self.min_percentage = min_percentage
        self.random_seed = random_seed
        # Normalise a missing dict to {} so later len() checks are safe.
        self.sampling_ratio_dict = sampling_ratio_dict or {}
        parameters.update(kwargs)
        super().__init__(
            parameters=parameters,
            component_obj=None,
            random_seed=random_seed,
        )

    def _initialize_sampler(self, X, y):
        """Helper function to initialize the undersampler component object.

        This sampler has no wrapped component object, so nothing is done here.

        Args:
            X (pd.DataFrame): Ignored.
            y (pd.Series): The target data.
        """

    def transform(self, X, y=None):
        """Transforms the input data by sampling the data.

        Args:
            X (pd.DataFrame): Training features.
            y (pd.Series): Target.

        Returns:
            pd.DataFrame, pd.Series: Transformed features and target.

        Raises:
            ValueError: If y is None.
        """
        # Fail fast: the target is mandatory, so check it before doing any
        # type inference work on X.
        if y is None:
            raise ValueError("y cannot be None")
        X_ww = infer_feature_types(X)
        y_ww = infer_feature_types(y)

        # Map the kept index *labels* back to row *positions* for iloc.
        index_df = pd.Series(y_ww.index)
        indices = self.fit_resample(X_ww, y_ww)
        train_indices = index_df[index_df.isin(indices)].index.values.tolist()
        return X_ww.ww.iloc[train_indices], y_ww.ww.iloc[train_indices]

    def _find_ideal_samples(self, y):
        """Returns dictionary of examples to drop for each class if we need to resample.

        Args:
            y (pd.Series): Target data passed in.

        Returns:
            dict: Dictionary with undersample target class as key, and number of samples to remove as the value.
                If we don't need to resample, returns empty dictionary.
        """
        counts = y.value_counts()
        # An empty (or all-NaN) target has no classes, hence nothing to drop.
        if counts.empty:
            return {}
        normalized_counts = y.value_counts(normalize=True)
        minority_fraction = min(normalized_counts)
        class_ratios = minority_fraction / normalized_counts

        # if no class ratios are larger than what we consider balanced, then the target is balanced
        if (class_ratios >= self.sampling_ratio).all():
            return {}

        # if any classes have less than min_samples counts and are less than min_percentage of the total data,
        # then it's severely imbalanced
        too_few = (counts < self.min_samples).any()
        too_rare = (normalized_counts < self.min_percentage).any()
        if too_few and too_rare:
            return {}

        # otherwise, we are imbalanced enough to perform on this
        undersample_classes = counts[class_ratios <= self.sampling_ratio].index.values

        # find goal size, round it down if it's a float; never go below min_samples
        minority_count = min(counts.values)
        goal_value = max(
            int((minority_count / self.sampling_ratio) // 1),
            self.min_samples,
        )

        # only report classes that actually lose rows
        return {
            k: counts[k] - goal_value
            for k in undersample_classes
            if counts[k] > goal_value
        }

    def _sampling_dict_to_remove_dict(self, y):
        """Turns the sampling dict input into a dict of samples to remove for each target, similar to the return of _find_ideal_samples.

        Args:
            y (pd.Series): Training data targets.

        Returns:
            dict: Dictionary with every class named in sampling_ratio_dict as key, and the number of
                samples to remove (never negative, possibly 0) as the value.

        Raises:
            KeyError: If sampling_ratio_dict names a class that does not occur in y.
        """
        y_dict = y.value_counts().to_dict()
        # Zero entries are kept on purpose: fit_resample still draws for them,
        # and dropping them would change the random stream for later classes.
        return {k: max(y_dict[k] - v, 0) for k, v in self.sampling_ratio_dict.items()}

    def fit_resample(self, X, y):
        """Resampling technique for this sampler.

        Args:
            X (pd.DataFrame): Training data to fit and resample.
            y (pd.Series): Training data targets to fit and resample.

        Returns:
            list: Indices to keep for training data, in order of first appearance in y's index.
        """
        if self.parameters["sampling_ratio_dict"]:
            self.sampling_ratio_dict = self._convert_dictionary(
                self.parameters["sampling_ratio_dict"],
                y,
            )

        y = infer_feature_types(y)
        random_state = np.random.RandomState(self.random_seed)
        if len(self.sampling_ratio_dict):
            result = self._sampling_dict_to_remove_dict(y)
        else:
            result = self._find_ideal_samples(y)

        # iterate through the classes we need to undersample and remove the
        # number of samples we need to remove
        indices_to_drop = []
        for key, value in result.items():
            indices = y.index[y == key].values
            indices_to_remove = random_state.choice(indices, value, replace=False)
            indices_to_drop.extend(indices_to_remove)

        # Keep the unique labels that were not dropped. dict.fromkeys preserves
        # first-appearance order, so the result does not depend on hash order
        # (which varies between interpreter runs for string labels).
        dropped = set(indices_to_drop)
        return [label for label in dict.fromkeys(y.index.values) if label not in dropped]