from evalml.pipelines.components.transformers import Transformer
from evalml.utils import _convert_woodwork_types_wrapper, infer_feature_types
[docs]class DropNullColumns(Transformer):
"""Transformer to drop features whose percentage of NaN values exceeds a specified threshold"""
name = "Drop Null Columns Transformer"
hyperparameter_ranges = {}
def __init__(self, pct_null_threshold=1.0, random_seed=0, **kwargs):
    """Initializes a transformer to drop features whose percentage of NaN values exceeds a specified threshold.

    Arguments:
        pct_null_threshold (float): The percentage of NaN values in an input feature to drop.
            Must be a value between [0, 1] inclusive. If equal to 0.0, will drop columns with any null values.
            If equal to 1.0, will drop columns with all null values. Defaults to 1.0.
        random_seed (int): Seed for the random number generator. Defaults to 0.
        **kwargs: Additional keyword arguments; they are stored in the component's
            parameters alongside ``pct_null_threshold``.

    Raises:
        ValueError: If ``pct_null_threshold`` is not within [0, 1]. NaN is rejected as well.
    """
    # The chained comparison evaluates to False for NaN, so a NaN threshold is
    # rejected here instead of passing validation and silently making every
    # later threshold comparison False.
    if not 0 <= pct_null_threshold <= 1:
        raise ValueError("pct_null_threshold must be a float between 0 and 1, inclusive.")
    parameters = {"pct_null_threshold": pct_null_threshold}
    parameters.update(kwargs)
    # Unset until fit() records which columns to drop.
    self._cols_to_drop = None
    super().__init__(parameters=parameters,
                     component_obj=None,
                     random_seed=random_seed)
def fit(self, X, y=None):
    """Determines which columns of X meet the configured null-percentage threshold.

    The per-column fraction of null values is compared against the
    ``pct_null_threshold`` parameter, and the labels of the matching columns
    are stored on ``self._cols_to_drop``.

    Arguments:
        X: The input features. Passed through ``infer_feature_types`` and converted
            to a dataframe before the null fractions are computed.
        y: Ignored. Defaults to None.

    Returns:
        self
    """
    threshold = self.parameters["pct_null_threshold"]
    features = _convert_woodwork_types_wrapper(infer_feature_types(X).to_dataframe())
    null_fraction = features.isnull().mean()
    # A threshold of 0 means "drop any column that contains a null", so the
    # comparison is strict there; with >= every column would match, including
    # columns that have no nulls at all.
    exceeds = null_fraction > 0 if threshold == 0.0 else null_fraction >= threshold
    self._cols_to_drop = list(null_fraction[exceeds].index)
    return self