Source code for evalml.data_checks.no_variance_data_check

"""Data check that checks if the target or any of the features have no variance."""
from evalml.data_checks import (
    DataCheck,
    DataCheckAction,
    DataCheckActionCode,
    DataCheckError,
    DataCheckMessageCode,
    DataCheckWarning,
)
from evalml.utils import infer_feature_types


class NoVarianceDataCheck(DataCheck):
    """Check if the target or any of the features have no variance.

    Args:
        count_nan_as_value (bool): If True, missing values will be counted as
            their own unique value. Additionally, if True, a column that has
            exactly two unique values once nulls are counted (i.e. a single
            real value plus missing data) produces a DataCheckWarning.
            Defaults to False.
    """

    def __init__(self, count_nan_as_value=False):
        # Stored inverted because it is forwarded as the ``dropna`` argument
        # of ``nunique`` in ``validate``.
        self._dropnan = not count_nan_as_value

    def _check_for_errors(self, column_name, count_unique, any_nulls):
        """Check if a column has no variance.

        Args:
            column_name (str): Name of the column we are checking.
            count_unique (float): Number of unique values in this column.
            any_nulls (bool): Whether this column has any missing data.

        Returns:
            DataCheckError if the column has at most one unique value,
            DataCheckWarning if the column has two unique values including
            NaN (only possible when nulls are counted as a value), and None
            when neither condition holds.
        """
        if count_unique <= 1:
            # The message is already fully interpolated. It must not be run
            # through ``str.format`` again: a column name containing braces
            # would raise or be rewritten.
            return DataCheckError(
                message=f"{column_name} has {int(count_unique)} unique value.",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.NO_VARIANCE,
                details={"column": column_name},
            )
        if count_unique == 2 and not self._dropnan and any_nulls:
            return DataCheckWarning(
                message=f"{column_name} has two unique values including nulls. "
                "Consider encoding the nulls for "
                "this column to be useful for machine learning.",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL,
                details={"column": column_name},
            )
        return None

    def validate(self, X, y):
        """Check if the target or any of the features have no variance (1 unique value).

        Every flagged feature column also gets a ``DROP_COL`` action appended
        to the results; no action is added for the target.

        Args:
            X (pd.DataFrame, np.ndarray): The input features.
            y (pd.Series, np.ndarray): The target data.

        Returns:
            dict: dict with "warnings", "errors" and "actions" lists
            corresponding to features or target with no variance.
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        y = infer_feature_types(y)

        unique_counts = X.nunique(dropna=self._dropnan).to_dict()
        any_nulls = (X.isnull().any()).to_dict()
        for col_name, count_unique in unique_counts.items():
            message = self._check_for_errors(
                col_name, count_unique, any_nulls[col_name]
            )
            if not message:
                continue
            DataCheck._add_message(message, results)
            results["actions"].append(
                DataCheckAction(
                    DataCheckActionCode.DROP_COL, metadata={"column": col_name}
                ).to_dict()
            )

        # Fall back to a generic label when the target has no (truthy) name.
        y_name = getattr(y, "name")
        if not y_name:
            y_name = "Y"
        target_message = self._check_for_errors(
            y_name, y.nunique(dropna=self._dropnan), y.isnull().any()
        )
        if target_message:
            DataCheck._add_message(target_message, results)
        return results