evalml.data_checks.HighlyNullDataCheck.validate

HighlyNullDataCheck.validate(X, y=None)[source]

Checks if there are any highly-null columns or rows in the input.

Parameters
  • X (pd.DataFrame, np.ndarray) – Features.

  • y (pd.Series, np.ndarray) – Ignored.

Returns

dict with "errors", "warnings", and "actions" entries; "warnings" contains a DataCheckWarning if there are any highly-null columns or rows.

Return type

dict

Example

>>> import pandas as pd
>>> class SeriesWrap():
...     def __init__(self, series):
...         self.series = series
...
...     def __eq__(self, series_2):
...         return all(self.series.eq(series_2.series))
...
>>> df = pd.DataFrame({
...    'lots_of_null': [None, None, None, None, 5],
...    'no_null': [1, 2, 3, 4, 5]
... })
>>> null_check = HighlyNullDataCheck(pct_null_threshold=0.50)
>>> validation_results = null_check.validate(df)
>>> validation_results['warnings'][0]['details']['pct_null_cols'] = SeriesWrap(validation_results['warnings'][0]['details']['pct_null_cols'])
>>> highly_null_rows = SeriesWrap(pd.Series([0.5, 0.5, 0.5, 0.5]))
>>> assert validation_results == {
...     "errors": [],
...     "warnings": [{"message": "4 out of 5 rows are more than 50.0% null",
...                   "data_check_name": "HighlyNullDataCheck",
...                   "level": "warning",
...                   "code": "HIGHLY_NULL_ROWS",
...                   "details": {"pct_null_cols": highly_null_rows}},
...                  {"message": "Column 'lots_of_null' is 50.0% or more null",
...                   "data_check_name": "HighlyNullDataCheck",
...                   "level": "warning",
...                   "code": "HIGHLY_NULL_COLS",
...                   "details": {"column": "lots_of_null", "pct_null_rows": 0.8}}],
...     "actions": [{"code": "DROP_ROWS", "metadata": {"rows": [0, 1, 2, 3]}},
...                 {"code": "DROP_COL", "metadata": {"column": "lots_of_null"}}]}