Source code for evalml.pipelines.components.transformers.imputers.knn_imputer

"""Component that imputes missing data according to a specified imputation strategy."""
import numpy as np
import pandas as pd
import woodwork
from sklearn.impute import KNNImputer as Sk_KNNImputer

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.utils import drop_natural_language_columns
from evalml.utils import infer_feature_types


[docs]class KNNImputer(Transformer): """Imputes missing data using KNN according to a specified number of neighbors. Natural language columns are ignored. Args: number_neighbors (int): Number of nearest neighbors for KNN to search for. Defaults to 3. random_seed (int): Seed for the random number generator. Defaults to 0. """ name = "KNN Imputer" def __init__(self, number_neighbors=3, random_seed=0, **kwargs): parameters = {"number_neighbors": number_neighbors} parameters.update(kwargs) imputer = Sk_KNNImputer( n_neighbors=number_neighbors, missing_values=np.nan, **kwargs, ) self._all_null_cols = None super().__init__( parameters=parameters, component_obj=imputer, random_seed=random_seed, )
[docs] def fit(self, X, y=None): """Fits imputer to data. 'None' values are converted to np.nan before imputation and are treated as the same. Args: X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features] y (pd.Series, optional): the target training data of length [n_samples] Returns: self Raises: ValueError: if the KNNImputer receives a dataframe with both Boolean and Categorical data. """ X = infer_feature_types(X) nan_ratio = X.ww.describe().loc["nan_count"] / X.shape[0] self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist() X, _ = drop_natural_language_columns(X) # If the Dataframe only had natural language columns, do nothing. if X.shape[1] == 0: return self self._component_obj.fit(X, y) return self
[docs] def transform(self, X, y=None): """Transforms input by imputing missing values. 'None' and np.nan values are treated as the same. Args: X (pd.DataFrame): Data to transform. y (pd.Series, optional): Ignored. Returns: pd.DataFrame: Transformed X """ X = infer_feature_types(X) not_all_null_cols = [col for col in X.columns if col not in self._all_null_cols] original_index = X.index # Drop natural language columns and transform the other columns X_t, natural_language_cols = drop_natural_language_columns(X) if X_t.shape[1] == 0: return X not_all_null_or_natural_language_cols = [ col for col in not_all_null_cols if col not in natural_language_cols ] X_t = self._component_obj.transform(X_t) X_t = pd.DataFrame(X_t, columns=not_all_null_or_natural_language_cols) X_schema = X.ww.schema X_bool_nullable_cols = X_schema._filter_cols(include=["BooleanNullable"]) new_ltypes_for_bool_nullable_cols = { col: "Boolean" for col in X_bool_nullable_cols } # Add back in natural language columns, unchanged if len(natural_language_cols) > 0: X_t = woodwork.concat_columns([X_t, X[natural_language_cols]]) X_t.ww.init( schema=X_schema, logical_types=new_ltypes_for_bool_nullable_cols, ) if not_all_null_or_natural_language_cols: X_t.index = original_index return X_t
[docs] def fit_transform(self, X, y=None): """Fits on X and transforms X. Args: X (pd.DataFrame): Data to fit and transform y (pd.Series, optional): Target data. Returns: pd.DataFrame: Transformed X """ return self.fit(X, y).transform(X, y)