# Source code for evalml.data_checks.default_data_checks

from .class_imbalance_data_check import ClassImbalanceDataCheck
from .data_checks import DataChecks
from .datetime_nan_data_check import DateTimeNaNDataCheck
from .highly_null_data_check import HighlyNullDataCheck
from .id_columns_data_check import IDColumnsDataCheck
from .invalid_targets_data_check import InvalidTargetDataCheck
from .natural_language_nan_data_check import NaturalLanguageNaNDataCheck
from .no_variance_data_check import NoVarianceDataCheck
from .target_distribution_data_check import TargetDistributionDataCheck
from .target_leakage_data_check import TargetLeakageDataCheck

from evalml.problem_types import ProblemTypes, handle_problem_types


class DefaultDataChecks(DataChecks):
    """A collection of basic data checks that is used by AutoML by default.

    Always includes:

        - `HighlyNullDataCheck`
        - `IDColumnsDataCheck`
        - `TargetLeakageDataCheck`
        - `InvalidTargetDataCheck`
        - `NoVarianceDataCheck`
        - `NaturalLanguageNaNDataCheck`
        - `DateTimeNaNDataCheck`

    Plus exactly one problem-type-specific check:

        - `TargetDistributionDataCheck` (for regression and time series
          regression problem types)
        - `ClassImbalanceDataCheck` (for every other problem type)

    Arguments:
        problem_type (str): The problem type that is being validated, e.g.
            regression, binary, or multiclass. It is normalized with
            `handle_problem_types` only to pick the problem-type-specific
            check; the value is forwarded unchanged to
            `InvalidTargetDataCheck`.
        objective (str or ObjectiveBase): Name or instance of the objective
            class. Forwarded to `InvalidTargetDataCheck`.
        n_splits (int): The number of splits as determined by the data
            splitter being used. Forwarded to `ClassImbalanceDataCheck` as
            `num_cv_folds`; unused for regression problem types.
            Defaults to 3.
    """

    # Checks that apply regardless of problem type. The problem-type-specific
    # check is appended after these in __init__.
    _DEFAULT_DATA_CHECK_CLASSES = [
        HighlyNullDataCheck,
        IDColumnsDataCheck,
        TargetLeakageDataCheck,
        InvalidTargetDataCheck,
        NoVarianceDataCheck,
        NaturalLanguageNaNDataCheck,
        DateTimeNaNDataCheck,
    ]

    def __init__(self, problem_type, objective, n_splits=3):
        # Parameters shared by both branches, keyed by data check class name.
        data_check_params = {
            "InvalidTargetDataCheck": {
                "problem_type": problem_type,
                "objective": objective,
            },
        }

        is_regression = handle_problem_types(problem_type) in [
            ProblemTypes.REGRESSION,
            ProblemTypes.TIME_SERIES_REGRESSION,
        ]
        if is_regression:
            problem_specific_check = TargetDistributionDataCheck
        else:
            problem_specific_check = ClassImbalanceDataCheck
            data_check_params["ClassImbalanceDataCheck"] = {
                "num_cv_folds": n_splits,
            }

        # List concatenation builds a new list, so the shared class-level
        # default list is never mutated.
        super().__init__(
            self._DEFAULT_DATA_CHECK_CLASSES + [problem_specific_check],
            data_check_params=data_check_params,
        )