Source code for evalml.data_checks.data_checks
"""A collection of data checks."""
import inspect
from evalml.data_checks import DataCheck
from evalml.exceptions import DataCheckInitError
from evalml.utils import infer_feature_types
def _has_defaults_for_all_args(init):
"""Test whether the init method has defaults for all arguments."""
signature = inspect.getfullargspec(init)
n_default_args = 0 if not signature.defaults else len(signature.defaults)
n_args = (
len(signature.args) - 1 if "self" in signature.args else len(signature.args)
)
return n_args == n_default_args
[docs]class DataChecks:
"""A collection of data checks.
Args:
data_checks (list (DataCheck)): List of DataCheck objects.
data_check_params (dict): Parameters for passed DataCheck objects.
"""
@staticmethod
def _validate_data_checks(data_check_classes, params):
"""Creates a DataChecks instance from a list of DataCheck classes and corresponding params."""
if not isinstance(data_check_classes, list):
raise ValueError(
f"Parameter data_checks must be a list. Received {type(data_check_classes).__name__}.",
)
if not all(
inspect.isclass(check) and issubclass(check, DataCheck)
for check in data_check_classes
):
raise ValueError(
"All elements of parameter data_checks must be an instance of DataCheck "
"or a DataCheck class with any desired parameters specified in the "
"data_check_params dictionary.",
)
params = params or dict()
if not isinstance(params, dict):
raise ValueError(f"Params must be a dictionary. Received {params}")
in_params = set(params.keys())
in_classes = set([c.name for c in data_check_classes])
name_to_class = {c.name: c for c in data_check_classes}
extraneous = in_params.difference(in_classes)
missing = in_classes.difference(in_params)
for extraneous_class in extraneous:
raise DataCheckInitError(
f"Class {extraneous_class} was provided in params dictionary but it does not match any name "
"in the data_check_classes list. Make sure every key of the params dictionary matches the name"
"attribute of a corresponding DataCheck class.",
)
for missing_class_name in missing:
if not _has_defaults_for_all_args(name_to_class[missing_class_name]):
raise DataCheckInitError(
f"Class {missing_class_name} was provided in the data_checks_classes list but it does not have "
"an entry in the parameters dictionary.",
)
@staticmethod
def _init_data_checks(data_check_classes, params):
data_check_instances = []
for data_check_class in data_check_classes:
class_params = params.get(data_check_class.name, {})
if not isinstance(class_params, dict):
raise DataCheckInitError(
f"Parameters for {data_check_class.name} were not in a dictionary. Received {class_params}.",
)
try:
data_check_instances.append(data_check_class(**class_params))
except TypeError as e:
raise DataCheckInitError(
f"Encountered the following error while initializing {data_check_class.name}: {e}",
)
return data_check_instances
def __init__(self, data_checks=None, data_check_params=None):
data_check_params = data_check_params or dict()
self._validate_data_checks(data_checks, data_check_params)
data_check_instances = self._init_data_checks(data_checks, data_check_params)
self.data_checks = data_check_instances
[docs] def validate(self, X, y=None):
"""Inspect and validate the input data against data checks and returns a list of warnings and errors if applicable.
Args:
X (pd.DataFrame, np.ndarray): The input data of shape [n_samples, n_features]
y (pd.Series, np.ndarray): The target data of length [n_samples]
Returns:
dict: Dictionary containing DataCheckMessage objects
"""
messages = []
existing_schema = X.ww.schema
X.ww.init(schema=existing_schema, already_sorted=True)
X = X.ww.drop(list(X.ww.select("index", return_schema=True).columns))
if y is not None:
y = infer_feature_types(y)
for data_check in self.data_checks:
messages_new = data_check.validate(X, y)
messages.extend(messages_new)
return messages