Source code for evalml.data_checks.outliers_data_check

"""Data check that checks if there are any outliers in input data by using IQR to determine score anomalies."""
import numpy as np
from scipy.stats import gamma

from evalml.data_checks import (
    DataCheck,
    DataCheckAction,
    DataCheckActionCode,
    DataCheckMessageCode,
    DataCheckWarning,
)
from evalml.utils import infer_feature_types


class OutliersDataCheck(DataCheck):
    """Checks if there are any outliers in input data by using IQR to determine score anomalies.

    Columns with score anomalies are considered to contain outliers.
    """

    def validate(self, X, y=None):
        """Check if there are any outliers in a dataframe by using IQR to determine column anomalies. Columns with anomalies are considered to contain outliers.

        Only the numeric columns of ``X`` are inspected. A column is reported when
        its box plot finds at least one potential outlier and the estimated
        probability that the column contains no true outliers is at most 0.9.

        Args:
            X (pd.DataFrame, np.ndarray): Input features.
            y (pd.Series, np.ndarray): Ignored. Defaults to None.

        Returns:
            dict: A dictionary with warnings if any columns have outliers.

        Examples:
            >>> import pandas as pd
            ...
            >>> df = pd.DataFrame({
            ...     'x': [1, 2, 3, 4, 5],
            ...     'y': [6, 7, 8, 9, 10],
            ...     'z': [-1, -2, -3, -1201, -4]
            ... })
            ...
            >>> outliers_check = OutliersDataCheck()
            >>> assert outliers_check.validate(df) == {
            ...     "warnings": [{"message": "Column(s) 'z' are likely to have outlier data.",
            ...                   "data_check_name": "OutliersDataCheck",
            ...                   "level": "warning",
            ...                   "code": "HAS_OUTLIERS",
            ...                   "details": {"columns": ["z"], "rows": [3], "column_indices": {"z": [3]}}}],
            ...     "errors": [],
            ...     "actions": [{"code": "DROP_ROWS", "metadata": {"rows": [3], "columns": None}}]}
            ...
            ...
            >>> box_plot_data = OutliersDataCheck.get_boxplot_data(df['z'])
            >>> box_plot_data["score"] = round(box_plot_data["score"], 2)
            >>> assert box_plot_data == {
            ...     'score': 0.89,
            ...     'pct_outliers': 0.2,
            ...     'values': {'q1': -4.0,
            ...                'median': -3.0,
            ...                'q3': -2.0,
            ...                'low_bound': -7.0,
            ...                'high_bound': 1.0,
            ...                'low_values': [-1201],
            ...                'high_values': [],
            ...                'low_indices': [3],
            ...                'high_indices': []}}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        X = X.ww.select("numeric")

        # Nothing to check when no numeric columns remain.
        if len(X.columns) == 0:
            return results

        has_outliers = []
        outlier_row_indices = {}
        for col in X.columns:
            box_plot_dict = OutliersDataCheck.get_boxplot_data(X.ww[col])
            box_plot_dict_values = box_plot_dict["values"]

            # "score" is the probability that the column holds no true outliers;
            # flag the column only when potential outliers exist and that
            # probability does not exceed 0.9.
            if box_plot_dict["pct_outliers"] > 0 and box_plot_dict["score"] <= 0.9:
                has_outliers.append(col)
                outlier_row_indices[col] = (
                    box_plot_dict_values["low_indices"]
                    + box_plot_dict_values["high_indices"]
                )

        if not has_outliers:
            return results

        quoted_columns = ", ".join(f"'{col}'" for col in has_outliers)
        warning_msg = f"Column(s) {quoted_columns} are likely to have outlier data."

        # De-duplicated, sorted union of the outlier row indices of every
        # flagged column.
        all_rows_with_indices = sorted(set().union(*outlier_row_indices.values()))

        results["warnings"].append(
            DataCheckWarning(
                message=warning_msg,
                data_check_name=self.name,
                message_code=DataCheckMessageCode.HAS_OUTLIERS,
                details={
                    "columns": has_outliers,
                    "rows": all_rows_with_indices,
                    "column_indices": outlier_row_indices,
                },
            ).to_dict()
        )

        results["actions"].append(
            DataCheckAction(
                DataCheckActionCode.DROP_ROWS,
                metadata={"rows": all_rows_with_indices},
            ).to_dict()
        )
        return results

    @staticmethod
    def get_boxplot_data(data_):
        """Returns box plot information for the given data.

        Args:
            data_ (pd.Series, np.ndarray): Input data.

        Returns:
            dict: A payload of box plot statistics with the keys ``"score"``
                (probability that the data contains no true outliers),
                ``"pct_outliers"`` (fraction of counted records lying outside
                the box plot bounds) and ``"values"`` (the quartiles, the low/high
                bounds, and the values and indices found beyond each bound).
        """
        data_ = infer_feature_types(data_, ignore_nullable_types=True)
        num_records = data_.count()
        box_plot_dict = data_.ww.box_plot_dict()
        quantiles = box_plot_dict["quantiles"]

        q1, q2, q3 = quantiles[0.25], quantiles[0.5], quantiles[0.75]

        num_outliers = len(box_plot_dict["low_values"]) + len(
            box_plot_dict["high_values"]
        )
        if num_records == 0:
            # No counted records: avoid the 0 / 0 division and log(0) below.
            # With no data there are no outliers, which matches the score the
            # gamma model yields for any non-empty column with 0% outliers.
            pct_outliers = 0.0
            score = 1.0
        else:
            pct_outliers = num_outliers / num_records
            score = OutliersDataCheck._no_outlier_prob(num_records, pct_outliers)

        payload = {
            "score": score,
            "pct_outliers": pct_outliers,
            "values": {
                "q1": q1,
                "median": q2,
                "q3": q3,
                "low_bound": box_plot_dict["low_bound"],
                "high_bound": box_plot_dict["high_bound"],
                "low_values": box_plot_dict["low_values"],
                "high_values": box_plot_dict["high_values"],
                "low_indices": box_plot_dict["low_indices"],
                "high_indices": box_plot_dict["high_indices"],
            },
        }
        return payload

    @staticmethod
    def _no_outlier_prob(num_records: int, pct_outliers: float) -> float:
        """Calculate the probability that there are no true outliers in a numeric (integer or float) column.

        It is based on creating 100,000 samples consisting of a given number of records, and then repeating
        this over a grid of sample sizes. Each value in a sample is drawn from a log normal distribution,
        and then the number of potential outliers in the data is determined using the skew adjusted box plot
        approach based on the medcouple statistic.

        It was observed that the distribution of the percentage of outliers could be described by a gamma distribution,
        with the shape and scale parameters changing with the sample size.
        For each sample size, the shape and scale parameters of the gamma distribution were estimated using maximum
        likelihood methods. The set of estimated shape and scale parameters for different sample sizes were then used
        to fit equations that relate these two parameters to the sample size.

        These equations use a transcendental logarithmic functional form that provides a seventh order Taylor series
        approximation to the two true functional relationships, and was estimated using least squares regression.

        Original credit goes to Jad Raad and Dan Putler of Alteryx.

        Args:
            num_records (int): The integer number of non-missing values in a column.
            pct_outliers (float): The percentage of potential outliers in a column.

        Returns:
            float: The probability that no outliers are present in the column.
        """
        # Calculate the shape and scale parameters of the approximate
        # gamma distribution given the number of records in the data.
        # For both measures, the coefficients are from a least squares
        # regression model on powers of log(num_records).
        log_n = np.log(num_records)
        log_shape = (
            25.8218734380722
            + -29.2320460088643 * log_n
            + 14.8228030299864 * log_n ** 2
            + -4.08052512660036 * log_n ** 3
            + 0.641429075842177 * log_n ** 4
            + -0.0571252717322226 * log_n ** 5
            + 0.00268694343911156 * log_n ** 6
            + -5.19415149920567e-05 * log_n ** 7
        )
        shape_param = np.exp(log_shape)
        log_scale = (
            -19.8196822259052
            + 18.5359212447622 * log_n
            + -8.80487628113388 * log_n ** 2
            + 2.27711870991327 * log_n ** 3
            + -0.344443407676357 * log_n ** 4
            + 0.029820831994345 * log_n ** 5
            + -0.00136611527293756 * log_n ** 6
            + 2.56727158170901e-05 * log_n ** 7
        )
        scale_param = np.exp(log_scale)

        # The probability of no true outliers is the upper tail of the gamma
        # distribution: one minus its cumulative distribution function
        # evaluated at the observed outlier percentage.
        prob_val = 1.0 - gamma.cdf(pct_outliers, shape_param, scale=scale_param)
        return prob_val