# Source code for evalml.pipelines.components.transformers.feature_selection.feature_selector
"""Component that selects top features based on importance weights."""
import pandas as pd
from evalml.exceptions import MethodPropertyNotFoundError
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types
class FeatureSelector(Transformer):
    """Selects top features based on importance weights.

    Args:
        parameters (dict): Dictionary of parameters for the component. Defaults to None.
        component_obj (obj): Third-party objects useful in component implementation. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    def get_names(self):
        """Get names of selected features.

        Pairs each entry of the wrapped component object's support mask with
        the corresponding entry of ``self.input_feature_names`` and keeps the
        names whose mask entry is truthy.

        Returns:
            list[str]: List of the names of features selected.
        """
        # One flag per input feature, as reported by the wrapped component object.
        selected_masks = self._component_obj.get_support()
        return [
            feature_name
            for (selected, feature_name) in zip(
                selected_masks,
                self.input_feature_names,
            )
            if selected
        ]

    def _handle_partial_dependence_fast_mode(
        self,
        pipeline_parameters,
        X=None,
        target=None,
    ):
        """Updates pipeline parameters to not drop any features based off of feature importance.

        This is needed, because fast mode refits cloned pipelines on single columns,
        so categorical columns that have one-hot encoding applied may lose some
        of their encoded columns with the default parameters. Therefore, we update the
        parameters here to not drop any columns, and use the original
        pipeline to determine if that feature gets dropped or not.

        Note:
            ``pipeline_parameters`` is modified in place and the same object is
            returned. ``X`` and ``target`` are accepted but not used by this
            implementation.

        Args:
            pipeline_parameters (dict): Pipeline parameters that will be used to create the pipelines
                used in partial dependence fast mode. Must already contain an entry keyed by
                this component's name.
            X (pd.DataFrame, optional): Holdout data being used for partial dependence calculations.
            target (str, optional): The target whose values we are trying to predict.

        Returns:
            pipeline_parameters (dict): Pipeline parameters updated to allow the FeatureSelector component
                to not drop any features.
        """
        # Raise the percent of features we want to keep to not lose any
        pipeline_parameters[self.name]["percent_features"] = 1.0
        # Lower the threshold for feature importance above which we keep features
        pipeline_parameters[self.name]["threshold"] = 0.0
        return pipeline_parameters