# Source code for evalml.data_checks.sparsity_data_check
"""Data check that checks if there are any columns with sparsely populated values in the input."""
from evalml.data_checks import (
    DataCheck,
    DataCheckActionCode,
    DataCheckActionOption,
    DataCheckMessageCode,
    DataCheckWarning,
)
from evalml.problem_types import handle_problem_types, is_multiclass
from evalml.utils.woodwork_utils import infer_feature_types

# Warning-message template; filled in by SparsityDataCheck.validate with
# (comma-separated quoted column names, problem type).
warning_too_unique = "Input columns ({}) for {} problem type are too sparse."
class SparsityDataCheck(DataCheck):
    """Check if there are any columns with sparsely populated values in the input.

    Args:
        problem_type (str or ProblemTypes): The specific problem type to data check for.
            'multiclass' or 'time series multiclass' is the only accepted problem type.
        threshold (float): The threshold value, or percentage of each column's unique values,
            below which, a column exhibits sparsity. Should be between 0 and 1.
        unique_count_threshold (int): The minimum number of times a unique value has to be
            present in a column to not be considered "sparse." Defaults to 10.

    Raises:
        ValueError: If the problem type is not multiclass, if threshold is outside [0, 1],
            or if unique_count_threshold is not a non-negative integer.
    """

    def __init__(self, problem_type, threshold, unique_count_threshold=10):
        self.problem_type = handle_problem_types(problem_type)
        if not is_multiclass(self.problem_type):
            raise ValueError("Sparsity is only defined for multiclass problem types.")
        self.threshold = threshold
        if threshold < 0 or threshold > 1:
            raise ValueError("Threshold must be a float between 0 and 1, inclusive.")
        self.unique_count_threshold = unique_count_threshold
        # Check the type BEFORE comparing so that a non-numeric value (e.g. "10")
        # raises the intended ValueError instead of a TypeError from `< 0`.
        # NOTE(review): 0 is still accepted here even though the message says
        # "positive integer"; tightening the bound could break existing callers.
        if not isinstance(unique_count_threshold, int) or unique_count_threshold < 0:
            raise ValueError("Unique count threshold must be positive integer.")

    def validate(self, X, y=None):
        """Calculate what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance.

        Args:
            X (pd.DataFrame, np.ndarray): Features.
            y (pd.Series, np.ndarray): Ignored.

        Returns:
            list: List containing a DataCheckWarning dict for each sparse column,
                empty if there are no sparse columns.

        Examples:
            >>> import pandas as pd

            For multiclass problems, if a column doesn't have enough representation from unique values, it will be considered sparse.

            >>> df = pd.DataFrame({
            ...    "sparse": [float(x) for x in range(100)],
            ...    "not_sparse": [float(1) for x in range(100)]
            ... })
            ...
            >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=0.5, unique_count_threshold=10)
            >>> assert sparsity_check.validate(df) == [
            ...     {
            ...         "message": "Input columns ('sparse') for multiclass problem type are too sparse.",
            ...         "data_check_name": "SparsityDataCheck",
            ...         "level": "warning",
            ...         "code": "TOO_SPARSE",
            ...         "details": {
            ...             "columns": ["sparse"],
            ...             "sparsity_score": {"sparse": 0.0},
            ...             "rows": None
            ...         },
            ...         "action_options": [
            ...             {
            ...                 "code": "DROP_COL",
            ...                 "data_check_name": "SparsityDataCheck",
            ...                 "parameters": {},
            ...                 "metadata": {"columns": ["sparse"], "rows": None}
            ...             }
            ...         ]
            ...     }
            ... ]
            ...
            >>> df["sparse"] = [float(x % 10) for x in range(100)]
            >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=1, unique_count_threshold=5)
            >>> assert sparsity_check.validate(df) == []
            ...
            >>> sparse_array = pd.Series([1, 1, 1, 2, 2, 3] * 3)
            >>> assert SparsityDataCheck.sparsity_score(sparse_array, count_threshold=5) == 0.6666666666666666
        """
        messages = []

        X = infer_feature_types(X)

        # Per-column sparsity score: fraction of unique values whose count
        # exceeds the configured threshold.
        res = X.apply(
            SparsityDataCheck.sparsity_score,
            count_threshold=self.unique_count_threshold,
        )
        too_sparse_cols = list(res.index[res < self.threshold])
        if too_sparse_cols:
            messages.append(
                DataCheckWarning(
                    message=warning_too_unique.format(
                        (", ").join(
                            ["'{}'".format(str(col)) for col in too_sparse_cols],
                        ),
                        self.problem_type,
                    ),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TOO_SPARSE,
                    details={
                        "columns": too_sparse_cols,
                        "sparsity_score": {
                            col: res.loc[col] for col in too_sparse_cols
                        },
                    },
                    action_options=[
                        DataCheckActionOption(
                            DataCheckActionCode.DROP_COL,
                            data_check_name=self.name,
                            metadata={"columns": too_sparse_cols},
                        ),
                    ],
                ).to_dict(),
            )
        return messages

    @staticmethod
    def sparsity_score(col, count_threshold=10):
        """Calculate a sparsity score for the given value counts by calculating the percentage of unique values that exceed the count_threshold.

        Args:
            col (pd.Series): Feature values.
            count_threshold (int): The number of instances below which a value is considered sparse.
                Default is 10.

        Returns:
            (float): Sparsity score, or the percentage of the unique values that exceed count_threshold.
        """
        counts = col.value_counts()
        # An empty column -- or one that is all NaN, since value_counts()
        # drops NaN by default -- has no unique values at all; treat it as
        # fully sparse rather than raising ZeroDivisionError on counts.size.
        if counts.size == 0:
            return 0.0
        # Strictly greater-than: a value occurring exactly count_threshold
        # times is still counted as sparse.
        return sum(counts > count_threshold) / counts.size