"""A collection of data checks."""importinspectfromevalml.data_checksimportDataCheckfromevalml.exceptionsimportDataCheckInitErrorfromevalml.utilsimportinfer_feature_typesdef_has_defaults_for_all_args(init):"""Test whether the init method has defaults for all arguments."""signature=inspect.getfullargspec(init)n_default_args=0ifnotsignature.defaultselselen(signature.defaults)n_args=(len(signature.args)-1if"self"insignature.argselselen(signature.args))returnn_args==n_default_args
class DataChecks:
    """A collection of data checks.

    Args:
        data_checks (list (DataCheck)): List of DataCheck classes (not
            instances); each one is instantiated by this collection.
        data_check_params (dict): Parameters for passed DataCheck classes,
            keyed by the ``name`` attribute of each class. Every value must
            itself be a dictionary of keyword arguments.
    """

    @staticmethod
    def _validate_data_checks(data_check_classes, params):
        """Validate a list of DataCheck classes against their parameters.

        Args:
            data_check_classes (list): DataCheck subclasses to validate.
            params (dict or None): Maps a data check ``name`` to the keyword
                arguments for that check. A falsy value is treated as an
                empty dictionary.

        Raises:
            ValueError: If ``data_check_classes`` is not a list, if any of its
                elements is not a DataCheck subclass, or if ``params`` is not
                a dictionary.
            DataCheckInitError: If ``params`` has a key that matches no class
                name, or if a class without an entry in ``params`` has an
                argument with no default value.
        """
        if not isinstance(data_check_classes, list):
            raise ValueError(
                f"Parameter data_checks must be a list. Received {type(data_check_classes).__name__}.",
            )
        if not all(
            inspect.isclass(check) and issubclass(check, DataCheck)
            for check in data_check_classes
        ):
            raise ValueError(
                "All elements of parameter data_checks must be an instance of DataCheck "
                "or a DataCheck class with any desired parameters specified in the "
                "data_check_params dictionary.",
            )

        params = params or dict()
        if not isinstance(params, dict):
            raise ValueError(f"Params must be a dictionary. Received {params}")

        name_to_class = {c.name: c for c in data_check_classes}
        in_params = set(params.keys())
        in_classes = set(name_to_class)
        extraneous = in_params.difference(in_classes)
        missing = in_classes.difference(in_params)

        # Only the first extraneous key encountered is reported.
        for extraneous_class in extraneous:
            raise DataCheckInitError(
                f"Class {extraneous_class} was provided in params dictionary but it does not match any name "
                "in the data_check_classes list. Make sure every key of the params dictionary matches the name "
                "attribute of a corresponding DataCheck class.",
            )

        # A class may be left out of params only if it can be built with no arguments.
        for missing_class_name in missing:
            if not _has_defaults_for_all_args(name_to_class[missing_class_name]):
                raise DataCheckInitError(
                    f"Class {missing_class_name} was provided in the data_checks_classes list but it does not have "
                    "an entry in the parameters dictionary.",
                )

    @staticmethod
    def _init_data_checks(data_check_classes, params):
        """Instantiate every data check class with its parameters.

        Args:
            data_check_classes (list): DataCheck subclasses to instantiate.
            params (dict): Maps a data check ``name`` to the keyword arguments
                for that check. Classes without an entry are instantiated
                with no arguments.

        Returns:
            list: One instance per class, in the order given.

        Raises:
            DataCheckInitError: If the parameters for a class are not a
                dictionary, or if instantiating a class raises ``TypeError``.
        """
        data_check_instances = []
        for data_check_class in data_check_classes:
            class_params = params.get(data_check_class.name, {})
            if not isinstance(class_params, dict):
                raise DataCheckInitError(
                    f"Parameters for {data_check_class.name} were not in a dictionary. Received {class_params}.",
                )
            try:
                data_check_instances.append(data_check_class(**class_params))
            except TypeError as e:
                # Chain explicitly so the original TypeError stays attached as the cause.
                raise DataCheckInitError(
                    f"Encountered the following error while initializing {data_check_class.name}: {e}",
                ) from e
        return data_check_instances

    def __init__(self, data_checks=None, data_check_params=None):
        data_check_params = data_check_params or dict()
        # Validation rejects anything that is not a list, including the default None.
        self._validate_data_checks(data_checks, data_check_params)
        data_check_instances = self._init_data_checks(data_checks, data_check_params)
        self.data_checks = data_check_instances

    def validate(self, X, y=None):
        """Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable.

        Args:
            X (pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features].
                NOTE(review): the body reads ``X.ww`` directly, so X appears to
                need the Woodwork accessor -- confirm ndarray input is supported.
            y (pd.Series, np.ndarray): The target data of length [n_samples]

        Returns:
            list: The messages returned by each data check's ``validate``
            call, concatenated in the order of ``self.data_checks``.
        """
        messages = []

        # Re-initialize Woodwork on X with its existing schema, then drop the
        # columns selected as "index" before handing X to the checks.
        existing_schema = X.ww.schema
        X.ww.init(schema=existing_schema, already_sorted=True)
        X = X.ww.drop(list(X.ww.select("index", return_schema=True).columns))
        if y is not None:
            y = infer_feature_types(y)

        for data_check in self.data_checks:
            messages_new = data_check.validate(X, y)
            messages.extend(messages_new)
        return messages