Source code for evalml.data_checks.default_data_checks

"""A default set of data checks that can be used for a variety of datasets."""
from .class_imbalance_data_check import ClassImbalanceDataCheck
from .data_checks import DataChecks
from .datetime_format_data_check import DateTimeFormatDataCheck
from .id_columns_data_check import IDColumnsDataCheck
from .invalid_target_data_check import InvalidTargetDataCheck
from .no_variance_data_check import NoVarianceDataCheck
from .null_data_check import NullDataCheck
from .target_distribution_data_check import TargetDistributionDataCheck
from .target_leakage_data_check import TargetLeakageDataCheck
from .ts_parameters_data_check import TimeSeriesParametersDataCheck
from .ts_splitting_data_check import TimeSeriesSplittingDataCheck

from evalml.problem_types import (
    ProblemTypes,
    handle_problem_types,
    is_classification,
    is_time_series,
)


[docs]class DefaultDataChecks(DataChecks): """A collection of basic data checks that is used by AutoML by default. Includes: - `NullDataCheck` - `HighlyNullRowsDataCheck` - `IDColumnsDataCheck` - `TargetLeakageDataCheck` - `InvalidTargetDataCheck` - `NoVarianceDataCheck` - `ClassImbalanceDataCheck` (for classification problem types) - `TargetDistributionDataCheck` (for regression problem types) - `DateTimeFormatDataCheck` (for time series problem types) - 'TimeSeriesParametersDataCheck' (for time series problem types) - `TimeSeriesSplittingDataCheck` (for time series classification problem types) Args: problem_type (str): The problem type that is being validated. Can be regression, binary, or multiclass. objective (str or ObjectiveBase): Name or instance of the objective class. n_splits (int): The number of splits as determined by the data splitter being used. Defaults to 3. problem_configuration (dict): Required for time series problem types. Values should be passed in for time_index, gap, forecast_horizon, and max_delay. """ _DEFAULT_DATA_CHECK_CLASSES = [ NullDataCheck, IDColumnsDataCheck, TargetLeakageDataCheck, InvalidTargetDataCheck, NoVarianceDataCheck, ] def __init__(self, problem_type, objective, n_splits=3, problem_configuration=None): default_checks = self._DEFAULT_DATA_CHECK_CLASSES data_check_params = {} if is_time_series(problem_type): if problem_configuration is None: raise ValueError( "problem_configuration cannot be None for time series problems!" ) if is_classification(problem_type): default_checks = default_checks + [TimeSeriesSplittingDataCheck] data_check_params.update( { "TimeSeriesSplittingDataCheck": { "problem_type": problem_type, "n_splits": n_splits, } } ) default_checks = default_checks + [ DateTimeFormatDataCheck, TimeSeriesParametersDataCheck, ] data_check_params.update( { "DateTimeFormatDataCheck": { "datetime_column": problem_configuration["time_index"], }, "TimeSeriesParametersDataCheck": { "problem_configuration": problem_configuration, "n_splits": n_splits, }, } ) if handle_problem_types(problem_type) in [ ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION, ]: default_checks = default_checks + [TargetDistributionDataCheck] data_check_params.update( { "InvalidTargetDataCheck": { "problem_type": problem_type, "objective": objective, } } ) else: default_checks = default_checks + [ClassImbalanceDataCheck] data_check_params.update( { "InvalidTargetDataCheck": { "problem_type": problem_type, "objective": objective, }, "ClassImbalanceDataCheck": {"num_cv_folds": n_splits}, } ) super().__init__( default_checks, data_check_params=data_check_params, )