Source code for evalml.data_checks.id_columns_data_check

"""Data check that checks if any of the features are likely to be ID columns."""
from evalml.data_checks import (
    DataCheck,
    DataCheckActionCode,
    DataCheckActionOption,
    DataCheckMessageCode,
    DataCheckWarning,
)
from evalml.utils import infer_feature_types


class IDColumnsDataCheck(DataCheck):
    """Check if any of the features are likely to be ID columns.

    Args:
        id_threshold (float): The probability threshold to be considered an ID column. Defaults to 1.0.

    Raises:
        ValueError: If id_threshold is less than 0 or greater than 1.
    """

    def __init__(self, id_threshold=1.0):
        if id_threshold < 0 or id_threshold > 1:
            raise ValueError("id_threshold must be a float between 0 and 1, inclusive.")
        self.id_threshold = id_threshold

    def validate(self, X, y=None):
        """Check if any of the features are likely to be ID columns. Currently performs a number of simple checks.

        Checks performed are:

            - column name is "id" (case-insensitive)
            - column name ends in "_id" (case-insensitive)
            - column contains all unique values (and is categorical / integer type)

        A column matching exactly one check is scored 0.95; a column that matches a name check
        and also has all unique values is scored 1.0. Columns whose score is at least
        ``id_threshold`` are reported together in a single warning.

        Args:
            X (pd.DataFrame, np.ndarray): The input features to check.
            y (pd.Series): The target. Defaults to None. Ignored.

        Returns:
            list[dict]: An empty list if no column meets the threshold; otherwise a list holding one
            warning dictionary that names the likely ID columns and offers a DROP_COL action for them.

        Examples:
            >>> import pandas as pd

            Columns that end in "_id" and are completely unique are likely to be ID columns.

            >>> df = pd.DataFrame({
            ...     "customer_id": [123, 124, 125, 126, 127],
            ...     "Sales": [10, 42, 31, 51, 61]
            ... })
            ...
            >>> id_col_check = IDColumnsDataCheck()
            >>> assert id_col_check.validate(df) == [
            ...     {
            ...         "message": "Columns 'customer_id' are 100.0% or more likely to be an ID column",
            ...         "data_check_name": "IDColumnsDataCheck",
            ...         "level": "warning",
            ...         "code": "HAS_ID_COLUMN",
            ...         "details": {"columns": ["customer_id"], "rows": None},
            ...         "action_options": [
            ...             {
            ...                 "code": "DROP_COL",
            ...                 "data_check_name": "IDColumnsDataCheck",
            ...                 "parameters": {},
            ...                 "metadata": {"columns": ["customer_id"], "rows": None}
            ...             }
            ...         ]
            ...    }
            ... ]

            Columns named "ID" with all unique values will also be identified as ID columns.

            >>> df = df.rename(columns={"customer_id": "ID"})
            >>> id_col_check = IDColumnsDataCheck()
            >>> assert id_col_check.validate(df) == [
            ...     {
            ...         "message": "Columns 'ID' are 100.0% or more likely to be an ID column",
            ...         "data_check_name": "IDColumnsDataCheck",
            ...         "level": "warning",
            ...         "code": "HAS_ID_COLUMN",
            ...         "details": {"columns": ["ID"], "rows": None},
            ...         "action_options": [
            ...            {
            ...                 "code": "DROP_COL",
            ...                 "data_check_name": "IDColumnsDataCheck",
            ...                 "parameters": {},
            ...                 "metadata": {"columns": ["ID"], "rows": None}
            ...             }
            ...         ]
            ...     }
            ... ]

            Despite being all unique, "Country_Rank" will not be identified as an ID column as id_threshold is set to 1.0
            by default and its name doesn't indicate that it's an ID.

            >>> df = pd.DataFrame({
            ...    "Country_Rank": [1, 2, 3, 4, 5],
            ...    "Sales": ["very high", "high", "high", "medium", "very low"]
            ... })
            ...
            >>> id_col_check = IDColumnsDataCheck()
            >>> assert id_col_check.validate(df) == []


            However lowering the threshold will cause this column to be identified as an ID.

            >>> id_col_check = IDColumnsDataCheck(id_threshold=0.95)
            >>> assert id_col_check.validate(df) == [
            ...     {
            ...         "message": "Columns 'Country_Rank' are 95.0% or more likely to be an ID column",
            ...         "data_check_name": "IDColumnsDataCheck",
            ...         "level": "warning",
            ...         "details": {"columns": ["Country_Rank"], "rows": None},
            ...         "code": "HAS_ID_COLUMN",
            ...         "action_options": [
            ...             {
            ...                 "code": "DROP_COL",
            ...                 "data_check_name": "IDColumnsDataCheck",
            ...                 "parameters": {},
            ...                 "metadata": {"columns": ["Country_Rank"], "rows": None}
            ...             }
            ...         ]
            ...     }
            ... ]
        """
        messages = []
        X = infer_feature_types(X)

        # Column labels are captured before the type-based selection below so that
        # the name checks see every column, not just Integer / Categorical ones.
        col_names = list(X.columns)

        # Columns whose name is "id": a name match alone is scored 0.95.
        id_cols = {col: 0.95 for col in col_names if str(col).lower() == "id"}

        # Uniqueness is only evaluated for Integer / Categorical columns.
        X = X.ww.select(include=["Integer", "Categorical"])
        check_all_unique = X.nunique() == len(X)
        cols_with_all_unique = check_all_unique[check_all_unique].index.tolist()
        # A unique column that already matched by name is promoted to 1.0.
        id_cols.update(
            {col: 1.0 if col in id_cols else 0.95 for col in cols_with_all_unique},
        )

        # Columns whose name ends with "_id". Membership is tested with the column
        # label itself (the dict's actual key type) rather than its string form, so
        # non-string labels are scored consistently with the uniqueness step above.
        col_ends_with_id = [
            col for col in col_names if str(col).lower().endswith("_id")
        ]
        id_cols.update(
            {col: 1.0 if col in id_cols else 0.95 for col in col_ends_with_id},
        )

        id_cols_above_threshold = {
            col: score for col, score in id_cols.items() if score >= self.id_threshold
        }
        if id_cols_above_threshold:
            warning_msg = "Columns {} are {}% or more likely to be an ID column"
            quoted_names = ", ".join(
                "'{}'".format(str(col)) for col in id_cols_above_threshold
            )
            messages.append(
                DataCheckWarning(
                    message=warning_msg.format(quoted_names, self.id_threshold * 100),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                    # Separate list objects for details and metadata so the two
                    # payloads never alias each other.
                    details={"columns": list(id_cols_above_threshold)},
                    action_options=[
                        DataCheckActionOption(
                            DataCheckActionCode.DROP_COL,
                            data_check_name=self.name,
                            metadata={"columns": list(id_cols_above_threshold)},
                        ),
                    ],
                ).to_dict(),
            )

        return messages