Source code for evalml.data_checks.default_data_checks
"""A default set of data checks that can be used for a variety of datasets."""fromevalml.data_checks.class_imbalance_data_checkimportClassImbalanceDataCheckfromevalml.data_checks.data_checksimportDataChecksfromevalml.data_checks.datetime_format_data_checkimportDateTimeFormatDataCheckfromevalml.data_checks.id_columns_data_checkimportIDColumnsDataCheckfromevalml.data_checks.invalid_target_data_checkimportInvalidTargetDataCheckfromevalml.data_checks.no_variance_data_checkimportNoVarianceDataCheckfromevalml.data_checks.null_data_checkimportNullDataCheckfromevalml.data_checks.target_distribution_data_checkimport(TargetDistributionDataCheck,)fromevalml.data_checks.target_leakage_data_checkimportTargetLeakageDataCheckfromevalml.data_checks.ts_parameters_data_checkimportTimeSeriesParametersDataCheckfromevalml.data_checks.ts_splitting_data_checkimportTimeSeriesSplittingDataCheckfromevalml.problem_typesimport(ProblemTypes,handle_problem_types,is_classification,is_time_series,)
[docs]classDefaultDataChecks(DataChecks):"""A collection of basic data checks that is used by AutoML by default. Includes: - `NullDataCheck` - `HighlyNullRowsDataCheck` - `IDColumnsDataCheck` - `TargetLeakageDataCheck` - `InvalidTargetDataCheck` - `NoVarianceDataCheck` - `ClassImbalanceDataCheck` (for classification problem types) - `TargetDistributionDataCheck` (for regression problem types) - `DateTimeFormatDataCheck` (for time series problem types) - 'TimeSeriesParametersDataCheck' (for time series problem types) - `TimeSeriesSplittingDataCheck` (for time series classification problem types) Args: problem_type (str): The problem type that is being validated. Can be regression, binary, or multiclass. objective (str or ObjectiveBase): Name or instance of the objective class. n_splits (int): The number of splits as determined by the data splitter being used. Defaults to 3. problem_configuration (dict): Required for time series problem types. Values should be passed in for time_index, gap, forecast_horizon, and max_delay. """_DEFAULT_DATA_CHECK_CLASSES=[NullDataCheck,IDColumnsDataCheck,TargetLeakageDataCheck,InvalidTargetDataCheck,NoVarianceDataCheck,]def__init__(self,problem_type,objective,n_splits=3,problem_configuration=None):default_checks=self._DEFAULT_DATA_CHECK_CLASSESdata_check_params={}ifis_time_series(problem_type):ifproblem_configurationisNone:raiseValueError("problem_configuration cannot be None for time series problems!",)ifis_classification(problem_type):default_checks=default_checks+[TimeSeriesSplittingDataCheck]data_check_params.update({"TimeSeriesSplittingDataCheck":{"problem_type":problem_type,"n_splits":n_splits,},},)default_checks=default_checks+[DateTimeFormatDataCheck,TimeSeriesParametersDataCheck,]data_check_params.update({"DateTimeFormatDataCheck":{"datetime_column":problem_configuration["time_index"],},"TimeSeriesParametersDataCheck":{"problem_configuration":problem_configuration,"n_splits":n_splits,},},)ifhandle_problem_types(problem_type)in[ProblemTypes.REGRESSION,ProblemTypes.TIME_SERIES_REGRESSION,]:default_checks=default_checks+[TargetDistributionDataCheck]data_check_params.update({"InvalidTargetDataCheck":{"problem_type":problem_type,"objective":objective,},},)else:default_checks=default_checks+[ClassImbalanceDataCheck]data_check_params.update({"InvalidTargetDataCheck":{"problem_type":problem_type,"objective":objective,},"ClassImbalanceDataCheck":{"num_cv_folds":n_splits},},)super().__init__(default_checks,data_check_params=data_check_params,)