Source code for evalml.pipelines.components.component_base

"""Base class for all components."""
import copy
from abc import ABC, abstractmethod

import cloudpickle

from evalml.exceptions import MethodPropertyNotFoundError
from evalml.pipelines.components.component_base_meta import ComponentBaseMeta
from evalml.utils import (
    _downcast_nullable_X,
    _downcast_nullable_y,
    classproperty,
    infer_feature_types,
    log_subtitle,
    safe_repr,
)
from evalml.utils.logger import get_logger


[docs]class ComponentBase(ABC, metaclass=ComponentBaseMeta): """Base class for all components. Args: parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. """ _default_parameters = None _can_be_used_for_fast_partial_dependence = True # Referring to the pandas nullable dtypes; not just woodwork logical types _integer_nullable_incompatibilities = [] _boolean_nullable_incompatibilities = [] is_multiseries = False def __init__(self, parameters=None, component_obj=None, random_seed=0, **kwargs): """Base class for all components. Args: parameters (dict): Dictionary of parameters for the component. Defaults to None. component_obj (obj): Third-party objects useful in component implementation. Defaults to None. random_seed (int): Seed for the random number generator. Defaults to 0. kwargs (Any): Any keyword arguments to pass into the component. """ self.random_seed = random_seed self._component_obj = component_obj self._parameters = parameters or {} self._is_fitted = False @property @classmethod @abstractmethod def name(cls): """Returns string name of this component.""" @property @classmethod @abstractmethod def modifies_features(cls): """Returns whether this component modifies (subsets or transforms) the features variable during transform. For Estimator objects, this attribute determines if the return value from `predict` or `predict_proba` should be used as features or targets. """ @property @classmethod @abstractmethod def modifies_target(cls): """Returns whether this component modifies (subsets or transforms) the target variable during transform. For Estimator objects, this attribute determines if the return value from `predict` or `predict_proba` should be used as features or targets. """ @property @classmethod @abstractmethod def training_only(cls): """Returns whether or not this component should be evaluated during training-time only, or during both training and prediction time.""" @classproperty def needs_fitting(self): """Returns boolean determining if component needs fitting before calling predict, predict_proba, transform, or feature_importances. This can be overridden to False for components that do not need to be fit or whose fit methods do nothing. Returns: True. """ return True @property def parameters(self): """Returns the parameters which were used to initialize the component.""" return copy.copy(self._parameters) @classproperty def default_parameters(cls): """Returns the default parameters for this component. Our convention is that Component.default_parameters == Component().parameters. Returns: dict: Default parameters for this component. """ if cls._default_parameters is None: cls._default_parameters = cls().parameters return cls._default_parameters @classproperty def _supported_by_list_API(cls): return not cls.modifies_target def _handle_partial_dependence_fast_mode( self, pipeline_parameters, X=None, target=None, ): """Determines whether or not a component can be used with partial dependence's fast mode. Args: pipeline_parameters (dict): Pipeline parameters that will be used to create the pipelines used in partial dependence fast mode. X (pd.DataFrame, optional): Holdout data being used for partial dependence calculations. target (str, optional): The target whose values we are trying to predict. """ if self._can_be_used_for_fast_partial_dependence: return pipeline_parameters raise TypeError( f"Component {self.name} cannot run partial dependence fast mode.", )
[docs] def clone(self): """Constructs a new component with the same parameters and random state. Returns: A new instance of this component with identical parameters and random state. """ return self.__class__(**self.parameters, random_seed=self.random_seed)
[docs] def fit(self, X, y=None): """Fits component to data. Args: X (pd.DataFrame): The input training data of shape [n_samples, n_features] y (pd.Series, optional): The target training data of length [n_samples] Returns: self Raises: MethodPropertyNotFoundError: If component does not have a fit method or a component_obj that implements fit. """ X = infer_feature_types(X) if y is not None: y = infer_feature_types(y) try: self._component_obj.fit(X, y) return self except AttributeError: raise MethodPropertyNotFoundError( "Component requires a fit method or a component_obj that implements fit", )
[docs] def describe(self, print_name=False, return_dict=False): """Describe a component and its parameters. Args: print_name(bool, optional): whether to print name of component return_dict(bool, optional): whether to return description as dictionary in the format {"name": name, "parameters": parameters} Returns: None or dict: Returns dictionary if return_dict is True, else None. """ logger = get_logger(f"{__name__}.describe") if print_name: title = self.name log_subtitle(logger, title) for parameter in self.parameters: parameter_str = ("\t * {} : {}").format( parameter, self.parameters[parameter], ) logger.info(parameter_str) if return_dict: component_dict = {"name": self.name} component_dict.update({"parameters": self.parameters}) return component_dict
[docs] def save(self, file_path, pickle_protocol=cloudpickle.DEFAULT_PROTOCOL): """Saves component at file path. Args: file_path (str): Location to save file. pickle_protocol (int): The pickle data stream format. """ with open(file_path, "wb") as f: cloudpickle.dump(self, f, protocol=pickle_protocol)
[docs] @staticmethod def load(file_path): """Loads component at file path. Args: file_path (str): Location to load file. Returns: ComponentBase object """ with open(file_path, "rb") as f: return cloudpickle.load(f)
def __eq__(self, other): """Check for equality.""" if not isinstance(other, self.__class__): return False random_seed_eq = self.random_seed == other.random_seed if not random_seed_eq: return False attributes_to_check = ["_parameters", "_is_fitted"] for attribute in attributes_to_check: if getattr(self, attribute) != getattr(other, attribute): return False return True def __str__(self): """String representation of a component.""" return self.name def __repr__(self): """String representation of a component.""" parameters_repr = ", ".join( [f"{key}={safe_repr(value)}" for key, value in self.parameters.items()], ) return f"{(type(self).__name__)}({parameters_repr})"
[docs] def update_parameters(self, update_dict, reset_fit=True): """Updates the parameter dictionary of the component. Args: update_dict (dict): A dict of parameters to update. reset_fit (bool, optional): If True, will set `_is_fitted` to False. """ self._parameters.update(update_dict) if reset_fit: self._is_fitted = False
def _handle_nullable_types(self, X=None, y=None): """Transforms X and y to remove any incompatible nullable types according to a component's needs. Args: X (pd.DataFrame, optional): Input data to a component of shape [n_samples, n_features]. May contain nullable types. y (pd.Series, optional): The target of length [n_samples]. May contain nullable types. Returns: X, y with any incompatible nullable types downcasted to compatible equivalents. """ X_bool_incompatible = "X" in self._boolean_nullable_incompatibilities X_int_incompatible = "X" in self._integer_nullable_incompatibilities if X is not None and (X_bool_incompatible or X_int_incompatible): X = _downcast_nullable_X( X, handle_boolean_nullable=X_bool_incompatible, handle_integer_nullable=X_int_incompatible, ) y_bool_incompatible = "y" in self._boolean_nullable_incompatibilities y_int_incompatible = "y" in self._integer_nullable_incompatibilities if y is not None and (y_bool_incompatible or y_int_incompatible): y = _downcast_nullable_y( y, handle_boolean_nullable=y_bool_incompatible, handle_integer_nullable=y_int_incompatible, ) return X, y