Source code for evalml.pipelines.components.transformers.column_selectors

from abc import abstractmethod

import numpy as np

from evalml.pipelines.components.transformers import Transformer
from evalml.utils.gen_utils import (
    _convert_to_woodwork_structure,
    _convert_woodwork_types_wrapper
)


class ColumnSelector(Transformer):

    def __init__(self, columns=None, random_state=0, **kwargs):
        """Initalizes an transformer that drops specified columns in input data.

        Arguments:
            columns (list(string)): List of column names, used to determine which columns to drop.
        """
        if columns and not isinstance(columns, list):
            raise ValueError(f"Parameter columns must be a list. Received {type(columns)}.")

        parameters = {"columns": columns}
        parameters.update(kwargs)
        super().__init__(parameters=parameters,
                         component_obj=None,
                         random_state=random_state)

    def _check_input_for_columns(self, X):
        cols = self.parameters.get("columns") or []

        if isinstance(X, np.ndarray):
            column_names = range(X.shape[1])
        else:
            column_names = X.columns

        missing_cols = set(cols) - set(column_names)
        if missing_cols:
            raise ValueError(
                "Columns {} not found in input data".format(', '.join(f"'{col_name}'" for col_name in missing_cols))
            )

    @abstractmethod
    def _modify_columns(self, cols, X, y=None):
        """How the transformer modifies the columns of the input data."""

    def fit(self, X, y=None):
        """'Fits' the transformer by checking if the column names are present in the dataset.

        Arguments:
            X (pd.DataFrame): Data to check.
            y (pd.Series, optional): Targets.

        Returns:
            None.
        """

        self._check_input_for_columns(X)
        return self

    def transform(self, X, y=None):
        X = _convert_to_woodwork_structure(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        self._check_input_for_columns(X)

        cols = self.parameters.get("columns") or []
        return self._modify_columns(cols, X, y)

    def fit_transform(self, X, y=None):
        """Fit transformer to data, then transform data.

        Arguments:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Targets.

        Returns:
            pd.DataFrame: Transformed X.
        """

        # transform method already calls fit under the hood.
        self.fit(X, y)
        return self.transform(X, y)


[docs]class DropColumns(ColumnSelector): """Drops specified columns in input data.""" name = "Drop Columns Transformer" hyperparameter_ranges = {} needs_fitting = False def _modify_columns(self, cols, X, y=None): return X.drop(columns=cols, axis=1)
[docs] def transform(self, X, y=None): """Transforms data X by dropping columns. Arguments: X (pd.DataFrame): Data to transform. y (pd.Series, optional): Targets. Returns: pd.DataFrame: Transformed X. """ return super().transform(X, y)
[docs]class SelectColumns(ColumnSelector): """Selects specified columns in input data.""" name = "Select Columns Transformer" hyperparameter_ranges = {} needs_fitting = False def _modify_columns(self, cols, X, y=None): return X[cols]
[docs] def transform(self, X, y=None): """Transforms data X by selecting columns. Arguments: X (pd.DataFrame): Data to transform. y (pd.Series, optional): Targets. Returns: pd.DataFrame: Transformed X. """ return super().transform(X, y)