# Source code for evalml.data_checks.uniqueness_data_check
"""Data check that checks if there are any columns in the input that are either too unique for classification problems or not unique enough for regression problems."""fromevalml.data_checksimport(DataCheck,DataCheckActionCode,DataCheckActionOption,DataCheckMessageCode,DataCheckWarning,)fromevalml.problem_typesimporthandle_problem_types,is_multiclass,is_regressionfromevalml.utils.woodwork_utilsimportinfer_feature_typeswarning_not_unique_enough=("Input columns {} for {} problem type are not unique enough.")warning_too_unique="Input columns {} for {} problem type are too unique."
class UniquenessDataCheck(DataCheck):
    """Check if there are any columns in the input that are either too unique for classification problems or not unique enough for regression problems.

    Args:
        problem_type (str or ProblemTypes): The specific problem type to data check for.
            e.g. 'binary', 'multiclass', 'regression, 'time series regression'
        threshold(float): The threshold to set as an upper bound on uniqueness for classification type problems
            or lower bound on for regression type problems.  Defaults to 0.50.

    Raises:
        ValueError: If `threshold` is not between 0 and 1, inclusive.
    """

    def __init__(self, problem_type, threshold=0.50):
        # Normalize string/enum input to a ProblemTypes value.
        self.problem_type = handle_problem_types(problem_type)
        if threshold < 0 or threshold > 1:
            raise ValueError("threshold must be a float between 0 and 1, inclusive.")
        self.threshold = threshold

    def validate(self, X, y=None):
        """Check if there are any columns in the input that are too unique in the case of classification problems or not unique enough in the case of regression problems.

        Args:
            X (pd.DataFrame, np.ndarray): Features.
            y (pd.Series, np.ndarray): Ignored.  Defaults to None.

        Returns:
            dict: dict with a DataCheckWarning if there are any too unique or not
                unique enough columns.  Each warning carries the offending column
                names, their uniqueness scores, and a DROP_COL action option.
        """
        messages = []
        X = infer_feature_types(X)
        # Per-column uniqueness score in [0, 1); higher means more unique.
        res = X.apply(UniquenessDataCheck.uniqueness_score)
        if is_regression(self.problem_type):
            # Regression: warn on columns whose values repeat too much.
            not_unique_enough_cols = list(res.index[res < self.threshold])
            # NOTE(review): the warning is appended even when
            # not_unique_enough_cols is empty — confirm this is intended.
            messages.append(
                DataCheckWarning(
                    message=warning_not_unique_enough.format(
                        (", ").join(
                            ["'{}'".format(str(col)) for col in not_unique_enough_cols],
                        ),
                        self.problem_type,
                    ),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
                    details={
                        "columns": not_unique_enough_cols,
                        "uniqueness_score": {
                            col: res.loc[col] for col in not_unique_enough_cols
                        },
                    },
                    action_options=[
                        DataCheckActionOption(
                            action_code=DataCheckActionCode.DROP_COL,
                            data_check_name=self.name,
                            metadata={"columns": not_unique_enough_cols},
                        ),
                    ],
                ).to_dict(),
            )
        elif is_multiclass(self.problem_type):
            # Multiclass: warn on columns that are nearly all-distinct.
            too_unique_cols = list(res.index[res > self.threshold])
            # NOTE(review): appended unconditionally as well — see above.
            messages.append(
                DataCheckWarning(
                    message=warning_too_unique.format(
                        (", ").join(
                            ["'{}'".format(str(col)) for col in too_unique_cols],
                        ),
                        self.problem_type,
                    ),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TOO_UNIQUE,
                    details={
                        "columns": too_unique_cols,
                        "uniqueness_score": {
                            col: res.loc[col] for col in too_unique_cols
                        },
                    },
                    action_options=[
                        DataCheckActionOption(
                            action_code=DataCheckActionCode.DROP_COL,
                            data_check_name=self.name,
                            metadata={"columns": too_unique_cols},
                        ),
                    ],
                ).to_dict(),
            )
        return messages

    @staticmethod
    def uniqueness_score(col, drop_na=True):
        """Calculate a uniqueness score for the provided field.

        NaN values are not considered as unique values in the calculation.

        Based on the Herfindahl-Hirschman Index: 1 minus the sum of the
        squared normalized value frequencies, so a constant column scores 0
        and an all-distinct column approaches 1.

        Args:
            col (pd.Series): Feature values.
            drop_na (bool): Whether to drop null values when computing the uniqueness score.  Defaults to True.

        Returns:
            (float): Uniqueness score.
        """
        norm_counts = col.value_counts(dropna=drop_na) / col.value_counts(
            dropna=drop_na,
        ).sum()
        square_counts = norm_counts**2
        score = 1 - square_counts.sum()
        return score