Source code for evalml.data_checks.uniqueness_data_check

from evalml.data_checks import (
    DataCheck,
    DataCheckAction,
    DataCheckActionCode,
    DataCheckMessageCode,
    DataCheckWarning,
)
from evalml.problem_types import (
    handle_problem_types,
    is_multiclass,
    is_regression,
)
from evalml.utils.woodwork_utils import infer_feature_types

warning_not_unique_enough = (
    "Input columns ({}) for {} problem type are not unique enough."
)
warning_too_unique = "Input columns ({}) for {} problem type are too unique."


[docs]class UniquenessDataCheck(DataCheck):
    """Checks if there are any columns in the input that are either too unique for classification problems
    or not unique enough for regression problems.

    Arguments:
        problem_type (str or ProblemTypes): The specific problem type to data check for.
            e.g. 'binary', 'multiclass', 'regression, 'time series regression'
        threshold(float): The threshold to set as an upper bound on uniqueness for classification type problems
            or lower bound on for regression type problems.  Defaults to 0.50.

    """

    def __init__(self, problem_type, threshold=0.50):
        self.problem_type = handle_problem_types(problem_type)
        if threshold < 0 or threshold > 1:
            raise ValueError("threshold must be a float between 0 and 1, inclusive.")
        self.threshold = threshold

[docs]    def validate(self, X, y=None):
        """Checks if there are any columns in the input that are too unique in the case of classification
        problems or not unique enough in the case of regression problems.

        Arguments:
            X (pd.DataFrame, np.ndarray): Features.
            y (pd.Series, np.ndarray): Ignored.  Defaults to None.

        Returns:
            dict: dict with a DataCheckWarning if there are any too unique or not
                unique enough columns.

        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...    'regression_unique_enough': [float(x) for x in range(100)],
            ...    'regression_not_unique_enough': [float(1) for x in range(100)]
            ... })
            >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
            >>> assert uniqueness_check.validate(df) == {"errors": [],\
                                                         "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",\
                                                                 "data_check_name": "UniquenessDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "NOT_UNIQUE_ENOUGH",\
                                                                 "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}],\
                                                         "actions": [{"code": "DROP_COL",\
                                                                      "metadata": {"column": "regression_not_unique_enough"}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)

        res = X.apply(UniquenessDataCheck.uniqueness_score)

        if is_regression(self.problem_type):
            not_unique_enough_cols = list(res.index[res < self.threshold])
            results["warnings"].extend(
                [
                    DataCheckWarning(
                        message=warning_not_unique_enough.format(
                            col_name, self.problem_type
                        ),
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
                        details={
                            "column": col_name,
                            "uniqueness_score": res.loc[col_name],
                        },
                    ).to_dict()
                    for col_name in not_unique_enough_cols
                ]
            )
            results["actions"].extend(
                [
                    DataCheckAction(
                        action_code=DataCheckActionCode.DROP_COL,
                        metadata={"column": col_name},
                    ).to_dict()
                    for col_name in not_unique_enough_cols
                ]
            )
        elif is_multiclass(self.problem_type):
            too_unique_cols = list(res.index[res > self.threshold])
            results["warnings"].extend(
                [
                    DataCheckWarning(
                        message=warning_too_unique.format(col_name, self.problem_type),
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.TOO_UNIQUE,
                        details={
                            "column": col_name,
                            "uniqueness_score": res.loc[col_name],
                        },
                    ).to_dict()
                    for col_name in too_unique_cols
                ]
            )
            results["actions"].extend(
                [
                    DataCheckAction(
                        action_code=DataCheckActionCode.DROP_COL,
                        metadata={"column": col_name},
                    ).to_dict()
                    for col_name in too_unique_cols
                ]
            )
        return results

[docs]    @staticmethod
    def uniqueness_score(col):
        """This function calculates a uniqueness score for the provided field.  NaN values are
        not considered as unique values in the calculation.

        Based on the Herfindahl–Hirschman Index.

        Arguments:
            col (pd.Series): Feature values.
        Returns:
            (float): Uniqueness score.
        """
        norm_counts = col.value_counts() / col.value_counts().sum()
        square_counts = norm_counts ** 2
        score = 1 - square_counts.sum()
        return score