Source code for evalml.data_checks.no_variance_data_check

import pandas as pd

from .data_check import DataCheck
from .data_check_message import DataCheckError, DataCheckWarning

from evalml.utils.logger import get_logger

logger = get_logger(__file__)


[docs]class NoVarianceDataCheck(DataCheck):
    """Check if any of the features or labels have no variance."""

[docs]    def __init__(self, count_nan_as_value=False):
        """Check if any of the features or labels have no variance.

        Arguments:
            count_nan_as_value (bool): If True, missing values will be counted as their own unique value.
                If set to True, a feature that has one unique value and all other data is missing, a
                DataCheckWarning will be returned instead of an error. Defaults to False.
        """
        self._dropnan = not count_nan_as_value

    def _check_for_errors(self, column_name, count_unique, any_nulls):
        """Checks if a column has no variance.

        Arguments:
            column_name (str): Name of the column we are checking.
            count_unique (float): Number of unique values in this column.
            any_nulls (bool): Whether this column has any missing data.

        Returns:
            DataCheckError if the column has no variance.
        """
        message = f"{column_name} has {int(count_unique)} unique value."

        if count_unique <= 1:
            return DataCheckError(message.format(name=column_name), self.name)

        elif count_unique == 2 and not self._dropnan and any_nulls:
            return DataCheckWarning(f"{column_name} has two unique values including nulls. "
                                    "Consider encoding the nulls for "
                                    "this column to be useful for machine learning.", self.name)

[docs]    def validate(self, X, y):
        """Check if any of the features or if the labels have no variance (1 unique value).

        Arguments:
            X (pd.DataFrame): The input features.
            y (pd.Series): The labels.

        Returns:
            list (DataCheckWarning or DataCheckError), list of warnings/errors corresponding to features or labels with no variance.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        if not isinstance(y, pd.Series):
            y = pd.Series(y)

        unique_counts = X.nunique(dropna=self._dropnan).to_dict()
        any_nulls = (X.isnull().any()).to_dict()

        messages = []

        for name in unique_counts:
            message = self._check_for_errors(name, unique_counts[name], any_nulls[name])

            if message:
                messages.append(message)

        y_name = getattr(y, "name")
        if not y_name:
            y_name = "Y"

        label_message = self._check_for_errors(y_name, y.nunique(dropna=self._dropnan), y.isnull().any())

        if label_message:
            messages.append(label_message)

        return messages