Source code for evalml.data_checks.datetime_format_data_check
import pandas as pd
from evalml.data_checks import DataCheck, DataCheckError, DataCheckMessageCode
from evalml.utils import infer_feature_types
class DateTimeFormatDataCheck(DataCheck):
    """Checks if the datetime column has equally spaced intervals and is monotonically increasing
    in order to be supported by time series estimators.

    Arguments:
        datetime_column (str, int): The name of the datetime column. If the datetime values are in
            the index, then pass "index". Defaults to "index".
    """

    def __init__(self, datetime_column="index"):
        self.datetime_column = datetime_column

    def validate(self, X, y):
        """Checks if the datetime values have a detectable frequency and are sorted in ascending order.

        When ``datetime_column`` is "index", the index of ``X`` is used if it is a
        ``pd.DatetimeIndex``; otherwise the index of ``y`` is tried. If neither is a
        ``pd.DatetimeIndex``, or the selected values are rejected by ``pd.infer_freq`` with a
        ``TypeError``, a single DATETIME_INFORMATION_NOT_FOUND error is reported and no further
        checks are run.

        Arguments:
            X (pd.DataFrame, np.ndarray): Features.
            y (pd.Series, np.ndarray): Target data.

        Returns:
            dict: Dictionary with the keys "warnings", "errors" and "actions". "errors" holds the
                dictionary form of a DataCheckError for each problem found (datetime information
                missing, no detectable frequency, values not sorted in ascending order);
                "warnings" and "actions" are always empty lists.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame(pd.date_range("January 1, 2021", periods=8), columns=["dates"])
            >>> y = pd.Series([1, 2, 4, 2, 1, 2, 3, 1])
            >>> X.iloc[7] = "January 9, 2021"
            >>> datetime_format_check = DateTimeFormatDataCheck(datetime_column="dates")
            >>> assert datetime_format_check.validate(X, y) == {
            ...     "errors": [{"message": "No frequency could be detected in dates, possibly due to uneven intervals.",
            ...                 "data_check_name": "DateTimeFormatDataCheck",
            ...                 "level": "error",
            ...                 "code": "DATETIME_HAS_UNEVEN_INTERVALS",
            ...                 "details": {}}],
            ...     "warnings": [],
            ...     "actions": []}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        y = infer_feature_types(y)

        no_dt_found = False

        if self.datetime_column != "index":
            datetime_values = X[self.datetime_column]
        else:
            # Prefer the index of X; fall back to the index of y.
            datetime_values = X.index
            if not isinstance(datetime_values, pd.DatetimeIndex):
                datetime_values = y.index
                if not isinstance(datetime_values, pd.DatetimeIndex):
                    no_dt_found = True

        inferred_freq = None
        if not no_dt_found:
            # Only attempt frequency inference on values that may actually be datetimes.
            # Calling infer_freq on an index already known not to be a DatetimeIndex is
            # wasted work and can raise errors other than the TypeError handled below.
            try:
                inferred_freq = pd.infer_freq(datetime_values)
            except TypeError:
                no_dt_found = True

        if no_dt_found:
            results["errors"].append(
                DataCheckError(
                    message="Datetime information could not be found in the data, or was not in a supported datetime format.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.DATETIME_INFORMATION_NOT_FOUND,
                ).to_dict()
            )
            # The remaining checks are meaningless without datetime values.
            return results

        if not inferred_freq:
            col_name = (
                self.datetime_column
                if self.datetime_column != "index"
                else "either index"
            )
            results["errors"].append(
                DataCheckError(
                    message=f"No frequency could be detected in {col_name}, possibly due to uneven intervals.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.DATETIME_HAS_UNEVEN_INTERVALS,
                ).to_dict()
            )

        # Wrap in a DatetimeIndex so the same check works for a column (Series) or an index.
        if not pd.DatetimeIndex(datetime_values).is_monotonic_increasing:
            results["errors"].append(
                DataCheckError(
                    message="Datetime values must be sorted in ascending order.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.DATETIME_IS_NOT_MONOTONIC,
                ).to_dict()
            )

        return results