Source code for evalml.data_checks.target_distribution_data_check

import numpy as np
import woodwork as ww
from scipy.stats import shapiro

from evalml.data_checks import (
    DataCheck,
    DataCheckAction,
    DataCheckActionCode,
    DataCheckError,
    DataCheckMessageCode,
    DataCheckWarning,
)
from evalml.utils import infer_feature_types


class TargetDistributionDataCheck(DataCheck):
    """Check whether the target looks lognormally distributed.

    A target that is closer to normal after a log transform may benefit from
    being transformed prior to training to improve model performance.
    """

    def validate(self, X, y):
        """Check if the target data has a lognormal-looking distribution.

        Arguments:
            X (pd.DataFrame, np.ndarray): Features. Ignored.
            y (pd.Series, np.ndarray): Target data to check for underlying
                distributions.

        Returns:
            dict: Dictionary with three keys, each mapping to a list of dicts:

                * ``"errors"``: a ``DataCheckError`` entry if ``y`` is None or
                  its Woodwork logical type is neither Integer nor Double.
                * ``"warnings"``: a ``DataCheckWarning`` entry if the target
                  may have a lognormal distribution.
                * ``"actions"``: a ``TRANSFORM_TARGET`` ``DataCheckAction``
                  entry accompanying the lognormal warning.

        Example:
            >>> X = None
            >>> y = [0.946, 0.972, 1.154, 0.954, 0.969, 1.222, 1.038, 0.999, 0.973, 0.897]
            >>> target_check = TargetDistributionDataCheck()
            >>> assert target_check.validate(X, y) == {
            ...     "errors": [],
            ...     "warnings": [{"message": "Target may have a lognormal distribution.",
            ...                   "data_check_name": "TargetDistributionDataCheck",
            ...                   "level": "warning",
            ...                   "code": "TARGET_LOGNORMAL_DISTRIBUTION",
            ...                   "details": {"shapiro-statistic/pvalue": '0.84/0.045'}}],
            ...     "actions": [{'code': 'TRANSFORM_TARGET',
            ...                  'metadata': {'column': None,
            ...                               'transformation_strategy': 'lognormal',
            ...                               'is_target': True}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        if y is None:
            results["errors"].append(
                DataCheckError(
                    message="Target is None",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_IS_NONE,
                    details={},
                ).to_dict()
            )
            return results

        y = infer_feature_types(y)
        allowed_types = [
            ww.logical_types.Integer.type_string,
            ww.logical_types.Double.type_string,
        ]
        target_type = y.ww.logical_type.type_string

        if target_type not in allowed_types:
            results["errors"].append(
                DataCheckError(
                    message="Target is unsupported {} type. Valid Woodwork logical types include: {}".format(
                        target_type,
                        ", ".join(allowed_types),
                    ),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                    details={"unsupported_type": target_type},
                ).to_dict()
            )
            return results

        # Shapiro-Wilk on the raw target: a p-value of 0.05 or more means
        # normality is not rejected, so there is nothing to report.
        if shapiro(y).pvalue >= 0.05:
            return results

        y_new = round(y, 6)
        if any(y <= 0):
            # np.log needs strictly positive input, so shift the target so that
            # its minimum becomes 1.
            # NOTE(review): this replaces the rounded copy above with a shift of
            # the unrounded target; kept as-is to preserve existing results.
            y_new = y + abs(y.min()) + 1

        # Drop values greater than 3 standard deviations above the mean.
        upper_bound = y_new.mean() + 3 * round(y.std(), 3)
        y_new = y_new[y_new < upper_bound]

        shapiro_test_og = shapiro(y_new)
        shapiro_test_log = shapiro(np.log(y_new))

        # If the p-value of the log transformed target is greater than or equal
        # to the p-value of the original target with outliers dropped, the log
        # transformed target has more of a normal distribution. Otherwise there
        # is nothing to flag.
        if shapiro_test_log.pvalue < shapiro_test_og.pvalue:
            return results

        details = {
            "shapiro-statistic/pvalue": f"{round(shapiro_test_og.statistic, 2)}/{round(shapiro_test_og.pvalue, 3)}"
        }
        results["warnings"].append(
            DataCheckWarning(
                message="Target may have a lognormal distribution.",
                data_check_name=self.name,
                message_code=DataCheckMessageCode.TARGET_LOGNORMAL_DISTRIBUTION,
                details=details,
            ).to_dict()
        )
        results["actions"].append(
            DataCheckAction(
                DataCheckActionCode.TRANSFORM_TARGET,
                metadata={
                    "column": None,
                    "is_target": True,
                    "transformation_strategy": "lognormal",
                },
            ).to_dict()
        )
        return results