Source code for evalml.data_checks.no_variance_data_check

"""Data check that checks if the target or any of the features have no variance."""
from evalml.data_checks import (
    DataCheck,
    DataCheckAction,
    DataCheckActionCode,
    DataCheckError,
    DataCheckMessageCode,
    DataCheckWarning,
)
from evalml.utils import infer_feature_types


class NoVarianceDataCheck(DataCheck):
    """Check if the target or any of the features have no variance.

    Args:
        count_nan_as_value (bool): If True, missing values will be counted as their own unique value.
            Additionally, if True, a column whose only two unique values are one real value plus null
            is reported with a DataCheckWarning instead of a DataCheckError.
            Defaults to False.
    """

    def __init__(self, count_nan_as_value=False):
        # Stored inverted because it is passed straight to pandas' ``nunique(dropna=...)``.
        self._dropnan = not count_nan_as_value

    def _add_no_variance_message(
        self, results, message_cls, template, subject, columns, code
    ):
        """Build one no-variance message and record it in ``results``.

        Args:
            results (dict): The results dict being built by ``validate``.
            message_cls (type): ``DataCheckError`` or ``DataCheckWarning``.
            template (str): Message template with a single ``{}`` placeholder.
            subject (str): Text substituted into ``template`` (column name(s) or target name).
            columns (list): Column names stored under ``details["columns"]``.
            code (DataCheckMessageCode): Message code attached to the message.
        """
        DataCheck._add_message(
            message_cls(
                message=template.format(subject),
                data_check_name=self.name,
                message_code=code,
                details={"columns": columns},
            ),
            results,
        )

    def validate(self, X, y=None):
        """Check if the target or any of the features have no variance (1 unique value).

        Args:
            X (pd.DataFrame, np.ndarray): The input features.
            y (pd.Series, np.ndarray): The target data. Optional; if None, only the features are checked.

        Returns:
            dict: A dict of warnings/errors corresponding to features or target with no variance.

        Examples:
            >>> import pandas as pd

            Columns or target data that have only one unique value will raise an error.

            >>> X = pd.DataFrame([2, 2, 2, 2, 2, 2, 2, 2], columns=["First_Column"])
            >>> y = pd.Series([1, 1, 1, 1, 1, 1, 1, 1])
            ...
            >>> novar_dc = NoVarianceDataCheck()
            >>> assert novar_dc.validate(X, y) == {
            ...     'warnings': [],
            ...     'errors': [{'message': "'First_Column' has 1 unique value.",
            ...                 'data_check_name': 'NoVarianceDataCheck',
            ...                 'level': 'error',
            ...                 'details': {'columns': ['First_Column'], 'rows': None},
            ...                 'code': 'NO_VARIANCE'},
            ...                {'message': 'Y has 1 unique value.',
            ...                 'data_check_name': 'NoVarianceDataCheck',
            ...                 'level': 'error',
            ...                 'details': {'columns': ['Y'], 'rows': None},
            ...                 'code': 'NO_VARIANCE'}],
            ...     'actions': [{'code': 'DROP_COL',
            ...                  'data_check_name': 'NoVarianceDataCheck',
            ...                  'metadata': {'columns': ["First_Column"], 'rows': None}}]}

            By default, NaNs will not be counted as distinct values. In the first example, there are still two distinct values
            besides None. In the second, there are no distinct values as the target is entirely null.

            >>> X["First_Column"] = [2, 2, 2, 3, 3, 3, None, None]
            >>> y = pd.Series([1, 1, 1, 2, 2, 2, None, None])
            >>> assert novar_dc.validate(X, y) == {'warnings': [], 'errors': [], 'actions': []}
            ...
            ...
            >>> y = pd.Series([None] * 7)
            >>> assert novar_dc.validate(X, y) == {
            ...     'warnings': [],
            ...     'errors': [{'message': 'Y has 0 unique values.',
            ...                 'data_check_name': 'NoVarianceDataCheck',
            ...                 'level': 'error',
            ...                 'details': {'columns': ['Y'], 'rows': None},
            ...                 'code': 'NO_VARIANCE'}],
            ...     'actions': []}

            As None is not considered a distinct value by default, there is only one unique value in X and y.

            >>> X["First_Column"] = [2, 2, 2, 2, None, None, None, None]
            >>> y = pd.Series([1, 1, 1, 1, None, None, None, None])
            >>> assert novar_dc.validate(X, y) == {
            ...     'warnings': [],
            ...     'errors': [{'message': "'First_Column' has 1 unique value.",
            ...                 'data_check_name': 'NoVarianceDataCheck',
            ...                 'level': 'error',
            ...                 'details': {'columns': ['First_Column'], 'rows': None},
            ...                 'code': 'NO_VARIANCE'},
            ...                {'message': 'Y has 1 unique value.',
            ...                 'data_check_name': 'NoVarianceDataCheck',
            ...                 'level': 'error',
            ...                 'details': {'columns': ['Y'], 'rows': None},
            ...                 'code': 'NO_VARIANCE'}],
            ...     'actions': [{'code': 'DROP_COL',
            ...                  'data_check_name': 'NoVarianceDataCheck',
            ...                  'metadata': {'columns': ['First_Column'], 'rows': None}}]}

            If count_nan_as_value is set to True, then NaNs are counted as unique values. In the event that there is an
            adequate number of unique values only because count_nan_as_value is set to True, a warning will be raised so
            the user can encode these values.

            >>> novar_dc = NoVarianceDataCheck(count_nan_as_value=True)
            >>> assert novar_dc.validate(X, y) == {
            ...     'warnings': [{'message': "'First_Column' has two unique values including nulls. Consider encoding the nulls for this column to be useful for machine learning.",
            ...                   'data_check_name': 'NoVarianceDataCheck',
            ...                   'level': 'warning',
            ...                   'details': {'columns': ['First_Column'], 'rows': None},
            ...                   'code': 'NO_VARIANCE_WITH_NULL'},
            ...                  {'message': 'Y has two unique values including nulls. Consider encoding the nulls for this column to be useful for machine learning.',
            ...                   'data_check_name': 'NoVarianceDataCheck',
            ...                   'level': 'warning',
            ...                   'details': {'columns': ['Y'], 'rows': None},
            ...                   'code': 'NO_VARIANCE_WITH_NULL'}],
            ...     'errors': [],
            ...     'actions': [{'code': 'DROP_COL',
            ...                  'data_check_name': 'NoVarianceDataCheck',
            ...                  'metadata': {'columns': ['First_Column'], 'rows': None}}]}

        """
        results = {"warnings": [], "errors": [], "actions": []}
        X = infer_feature_types(X, ignore_nullable_types=True)
        if y is not None:
            y = infer_feature_types(y, ignore_nullable_types=True)

        zero_unique_message = "{} has 0 unique values."
        one_unique_message = "{} has 1 unique value."
        two_unique_with_null_message = "{} has two unique values including nulls. Consider encoding the nulls for this column to be useful for machine learning."

        # Bucket every feature column by how many distinct values it holds.
        unique_counts = X.nunique(dropna=self._dropnan).to_dict()
        any_nulls = X.isnull().any().to_dict()
        zero_unique = []
        one_unique = []
        one_unique_with_null = []
        for col_name, count_unique in unique_counts.items():
            if count_unique == 0:
                zero_unique.append(col_name)
            elif count_unique == 1:
                one_unique.append(col_name)
            elif count_unique == 2 and not self._dropnan and any_nulls[col_name]:
                # Two "values" only because null was counted as one of them.
                one_unique_with_null.append(col_name)

        # One message per non-empty bucket, emitted in this fixed order.
        feature_findings = (
            (
                zero_unique,
                DataCheckError,
                zero_unique_message,
                DataCheckMessageCode.NO_VARIANCE,
            ),
            (
                one_unique,
                DataCheckError,
                one_unique_message,
                DataCheckMessageCode.NO_VARIANCE,
            ),
            (
                one_unique_with_null,
                DataCheckWarning,
                two_unique_with_null_message,
                DataCheckMessageCode.NO_VARIANCE_WITH_NULL,
            ),
        )
        for columns, message_cls, template, code in feature_findings:
            if not columns:
                continue
            quoted_names = ", ".join("'{}'".format(str(col)) for col in columns)
            self._add_no_variance_message(
                results, message_cls, template, quoted_names, columns, code
            )

        # A single DROP_COL action covers every flagged feature, warnings included.
        all_cols = zero_unique + one_unique + one_unique_with_null
        if all_cols:
            results["actions"].append(
                DataCheckAction(
                    DataCheckActionCode.DROP_COL,
                    data_check_name=self.name,
                    metadata={"columns": all_cols},
                ).to_dict()
            )

        # Check target for variance (skipped when no target was supplied).
        if y is None:
            return results

        # An unnamed (or falsy-named) target is reported as "Y".
        y_name = y.name or "Y"
        y_unique_count = y.nunique(dropna=self._dropnan)

        if y_unique_count == 0:
            self._add_no_variance_message(
                results,
                DataCheckError,
                zero_unique_message,
                y_name,
                [y_name],
                DataCheckMessageCode.NO_VARIANCE,
            )
        elif y_unique_count == 1:
            self._add_no_variance_message(
                results,
                DataCheckError,
                one_unique_message,
                y_name,
                [y_name],
                DataCheckMessageCode.NO_VARIANCE,
            )
        elif y_unique_count == 2 and not self._dropnan and y.isnull().any():
            self._add_no_variance_message(
                results,
                DataCheckWarning,
                two_unique_with_null_message,
                y_name,
                [y_name],
                DataCheckMessageCode.NO_VARIANCE_WITH_NULL,
            )

        return results