Source code for evalml.data_checks.datetime_nan_data_check

"""Data check that checks each column in the input for datetime features and will issue an error if NaN values are present."""

from evalml.data_checks import DataCheck, DataCheckError, DataCheckMessageCode
from evalml.utils.woodwork_utils import infer_feature_types

# Message template for the DATETIME_HAS_NAN error. The single "{}" placeholder is
# filled (via str.format) with the name(s) of the datetime column(s) that contain
# missing values.
error_contains_nan = "Input datetime column(s) ({}) contains NaN values. Please impute NaN values or drop these rows or columns."


class DateTimeNaNDataCheck(DataCheck):
    """Check each datetime column in the input and issue an error if NaN values are present."""

    def validate(self, X, y=None):
        """Check if any datetime columns contain NaN values.

        Args:
            X (pd.DataFrame, np.ndarray): Features.
            y (pd.Series, np.ndarray): Ignored. Defaults to None.

        Returns:
            list[dict]: An empty list if no datetime column contains NaN values. Otherwise a list
                holding a single DataCheckError (converted with ``to_dict``) that names every
                datetime column containing NaN values.

        Examples:
            >>> import pandas as pd
            >>> import numpy as np
            ...
            >>> dates = [["2-1-21", "3-1-21"],
            ...         ["2-2-21", "3-2-21"],
            ...         ["2-3-21", "3-3-21"],
            ...         ["2-4-21", "3-4-21"]]
            >>> df = pd.DataFrame(dates, columns=["index", "days"])
            >>> dt_nan_dc = DateTimeNaNDataCheck()
            >>> assert dt_nan_dc.validate(df) == []

            The first value in the column "index" is replaced with NaT, which makes this data check return an error message.

            >>> dates[0][0] = np.datetime64("NaT")
            >>> df = pd.DataFrame(dates, columns=["index", "days"])
            >>> assert dt_nan_dc.validate(df) == [
            ...     {
            ...         "message": "Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.",
            ...         "data_check_name": "DateTimeNaNDataCheck",
            ...         "level": "error",
            ...         "details": {"columns": ["index"], "rows": None},
            ...         "code": "DATETIME_HAS_NAN",
            ...         "action_options": []
            ...     }
            ... ]
            ...

            The value None will be treated the same way.

            >>> dates[0][1] = None
            >>> df = pd.DataFrame(dates, columns=["index", "days"])
            >>> assert dt_nan_dc.validate(df) == [
            ...     {
            ...         "message": "Input datetime column(s) (index, days) contains NaN values. Please impute NaN values or drop these rows or columns.",
            ...         "data_check_name": "DateTimeNaNDataCheck",
            ...         "level": "error",
            ...         "details": {"columns": ["index", "days"], "rows": None},
            ...         "code": "DATETIME_HAS_NAN",
            ...         "action_options": []
            ...     }
            ... ]
            ...

            As will pd.NA.

            >>> dates[0][1] = pd.NA
            >>> df = pd.DataFrame(dates, columns=["index", "days"])
            >>> assert dt_nan_dc.validate(df) == [
            ...     {
            ...         "message": "Input datetime column(s) (index, days) contains NaN values. Please impute NaN values or drop these rows or columns.",
            ...         "data_check_name": "DateTimeNaNDataCheck",
            ...         "level": "error",
            ...         "details": {"columns": ["index", "days"], "rows": None},
            ...         "code": "DATETIME_HAS_NAN",
            ...         "action_options": []
            ...     }
            ... ]
        """
        X = infer_feature_types(X)
        datetime_cols = X.ww.select("datetime")

        # Index the column labels with the per-column "has any missing value" mask.
        nan_columns = datetime_cols.columns[datetime_cols.isna().any()].tolist()
        if not nan_columns:
            return []

        # Labels are stringified so they can be joined into the message; the same
        # stringified labels are what gets reported under details["columns"].
        nan_columns = [str(col) for col in nan_columns]

        # str.join on a one-element list yields that element unchanged, so no
        # special case is needed for a single offending column.
        cols_str = ", ".join(nan_columns)

        return [
            DataCheckError(
                message=error_contains_nan.format(cols_str),
                data_check_name=self.name,
                message_code=DataCheckMessageCode.DATETIME_HAS_NAN,
                details={"columns": nan_columns},
            ).to_dict()
        ]