# Source code for evalml.pipelines.components.utils

"""Utility methods for EvalML components."""
import inspect

from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_is_fitted

from evalml.exceptions import MissingComponentError
from evalml.model_family.utils import ModelFamily, handle_model_family
from evalml.pipelines.components.component_base import ComponentBase
from evalml.pipelines.components.estimators.estimator import Estimator
from evalml.pipelines.components.transformers.transformer import Transformer
from evalml.problem_types import ProblemTypes, handle_problem_types
from evalml.utils import get_importable_subclasses


def _all_estimators():
    """Look up the importable ``Estimator`` subclasses.

    Returns:
        Whatever ``get_importable_subclasses`` returns for ``Estimator`` when
        called with ``used_in_automl=False``.
    """
    # NOTE(review): the exact meaning of `used_in_automl` is defined by
    # get_importable_subclasses; presumably False means "do not restrict to
    # components used by AutoML search" - confirm against that helper.
    estimator_classes = get_importable_subclasses(Estimator, used_in_automl=False)
    return estimator_classes


def _all_estimators_used_in_search():
    """Look up the importable ``Estimator`` subclasses flagged for AutoML search.

    Returns:
        Whatever ``get_importable_subclasses`` returns for ``Estimator`` when
        called with ``used_in_automl=True``.
    """
    # NOTE(review): the filtering applied for `used_in_automl=True` lives in
    # get_importable_subclasses - confirm the details there.
    search_estimator_classes = get_importable_subclasses(
        Estimator,
        used_in_automl=True,
    )
    return search_estimator_classes


def _all_transformers():
    """Look up the importable ``Transformer`` subclasses.

    Returns:
        Whatever ``get_importable_subclasses`` returns for ``Transformer`` when
        called with ``used_in_automl=False``.
    """
    # NOTE(review): see get_importable_subclasses for what `used_in_automl`
    # controls; this wrapper only forwards the flag.
    transformer_classes = get_importable_subclasses(
        Transformer,
        used_in_automl=False,
    )
    return transformer_classes


def all_components():
    """Get all available components.

    Returns:
        The result of ``_all_estimators()`` concatenated (with ``+``) with the
        result of ``_all_transformers()``; estimators come first.
    """
    estimators = _all_estimators()
    transformers = _all_transformers()
    return estimators + transformers


def allowed_model_families(problem_type):
    """List the model families available for a particular problem type.

    Every estimator returned by ``_all_estimators_used_in_search()`` whose
    ``supported_problem_types`` include the requested problem type contributes
    its ``model_family``; duplicates are collapsed.

    Args:
        problem_type (ProblemTypes or str): ProblemTypes enum or string.

    Returns:
        list[ModelFamily]: The distinct model families, in set-iteration order.
    """
    requested_type = handle_problem_types(problem_type)

    families = set()
    for estimator in _all_estimators_used_in_search():
        # Normalize each supported type so the membership test compares
        # like with like.
        supported_types = {
            handle_problem_types(supported)
            for supported in estimator.supported_problem_types
        }
        if requested_type in supported_types:
            families.add(estimator.model_family)

    return list(families)


def get_estimators(problem_type, model_families=None):
    """Returns the estimators allowed for a particular problem type.

    Can also optionally filter by a list of model families. When
    ``model_families`` is None, every model family allowed for the problem
    type is used.

    Args:
        problem_type (ProblemTypes or str): Problem type to filter for.
        model_families (list[ModelFamily] or list[str]): Model families to filter for.
            Defaults to None, meaning all families allowed for ``problem_type``.

    Returns:
        list[class]: A list of estimator subclasses.

    Raises:
        TypeError: If the model_families parameter is not a list.
        RuntimeError: If a model family is not valid for the problem type.
    """
    if model_families is not None and not isinstance(model_families, list):
        raise TypeError("model_families parameter is not a list.")
    problem_type = handle_problem_types(problem_type)

    # Computed exactly once: it is both the default filter and the reference
    # used to validate caller-supplied families, and each call walks every
    # importable estimator.
    all_model_families = allowed_model_families(problem_type)
    if model_families is None:
        model_families = all_model_families

    # Normalize entries (e.g. strings) through handle_model_family.
    model_families = [
        handle_model_family(model_family) for model_family in model_families
    ]
    for model_family in model_families:
        if model_family not in all_model_families:
            raise RuntimeError(
                "Unrecognized model type for problem type %s: %s"
                % (problem_type, model_family),
            )

    # Keep estimators that support the problem type AND belong to one of the
    # requested families (problem type is checked first, as before).
    return [
        estimator_class
        for estimator_class in _all_estimators_used_in_search()
        if problem_type
        in [
            handle_problem_types(supported_pt)
            for supported_pt in estimator_class.supported_problem_types
        ]
        and estimator_class.model_family in model_families
    ]


def estimator_unable_to_handle_nans(estimator_class):
    """Report whether the given estimator class cannot accept NaN input values.

    The decision is based solely on the class's ``model_family`` attribute:
    extra trees, random forest, linear model and decision tree families are
    treated as unable to handle NaNs.

    Args:
        estimator_class (Estimator): Estimator class

    Raises:
        ValueError: If estimator is not a valid estimator class (i.e. it has
            no ``model_family`` attribute).

    Returns:
        bool: True if estimator class is unable to process NaN values, False otherwise.
    """
    if not hasattr(estimator_class, "model_family"):
        raise ValueError("`estimator_class` must have a `model_family` attribute.")

    nan_intolerant_families = (
        ModelFamily.EXTRA_TREES,
        ModelFamily.RANDOM_FOREST,
        ModelFamily.LINEAR_MODEL,
        ModelFamily.DECISION_TREE,
    )
    return estimator_class.model_family in nan_intolerant_families


def handle_component_class(component_class):
    """Standardizes input from a string name to a ComponentBase subclass if necessary.

    If a str is provided, the component class registered under that name (the
    ``name`` attribute of an entry in ``all_components()``) is looked up and
    returned. If a ComponentBase subclass or a component instance is provided,
    it is returned without modification.

    Args:
        component_class (str, ComponentBase): Input to be standardized.

    Returns:
        ComponentBase

    Raises:
        ValueError: If input is not a valid component class.
        MissingComponentError: If the component cannot be found.

    Examples:
        >>> from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor
        >>> handle_component_class(DecisionTreeRegressor)
        <class 'evalml.pipelines.components.estimators.regressors.decision_tree_regressor.DecisionTreeRegressor'>
        >>> handle_component_class("Random Forest Regressor")
        <class 'evalml.pipelines.components.estimators.regressors.rf_regressor.RandomForestRegressor'>
    """
    # Already a component instance: pass through untouched.
    if isinstance(component_class, ComponentBase):
        return component_class
    # Already a component class: pass through untouched.
    if inspect.isclass(component_class) and issubclass(component_class, ComponentBase):
        return component_class

    if not isinstance(component_class, str):
        message = (
            "component_class may only contain str or ComponentBase subclasses, not '{}'"
        )
        raise ValueError(message.format(type(component_class)))

    # Resolve the string against the display names of all known components.
    components_by_name = {
        component.name: component for component in all_components()
    }
    if component_class in components_by_name:
        return components_by_name[component_class]
    raise MissingComponentError(
        'Component "{}" was not found'.format(component_class),
    )


def drop_natural_language_columns(X):
    """Drops natural language columns from dataframes for the imputers.

    Columns are identified through the Woodwork accessor (``X.ww``) by selecting
    the ``"NaturalLanguage"`` type. When none are found the input object is
    returned as-is.

    Args:
        X (pd.Dataframe): The dataframe that we want to impute on.

    Returns:
        pd.Dataframe: the dataframe with any natural language columns dropped.
        list: list of all the columns that are considered natural language.
    """
    natural_language_schema = X.ww.select(["NaturalLanguage"], return_schema=True)
    natural_language_columns = list(natural_language_schema.columns.keys())

    if not natural_language_columns:
        return X, natural_language_columns

    X_copy = X.ww.copy()
    X_without_text = X_copy.ww.drop(columns=natural_language_columns)
    return X_without_text, natural_language_columns


def set_boolean_columns_to_categorical(X):
    """Sets boolean columns to categorical for the imputer.

    Works on a Woodwork copy of ``X``. The copy is re-initialized with the
    schema of the non-Boolean columns plus a ``"Categorical"`` logical type for
    every column that was Boolean.

    Args:
        X (pd.Dataframe): The dataframe that we want to impute on.

    Returns:
        pd.Dataframe: the dataframe with any of its ww columns that are boolean set to categorical.
    """
    X = X.ww.copy()
    schema = X.ww.schema

    # Schema restricted to the columns whose typing must be kept as-is.
    non_boolean_cols = schema._filter_cols(exclude=["Boolean"])
    preserved_schema = schema.get_subset_schema(subset_cols=non_boolean_cols)

    # Every Boolean column gets an explicit Categorical logical type.
    categorical_overrides = {}
    for col in schema._filter_cols(include=["Boolean"]):
        categorical_overrides[col] = "Categorical"

    X.ww.init(schema=preserved_schema, logical_types=categorical_overrides)
    return X


class WrappedSKClassifier(BaseEstimator, ClassifierMixin):
    """Scikit-learn classifier wrapper class."""

    def __init__(self, pipeline):
        """Scikit-learn classifier wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn classifier class wrapping that pipeline.

        If the wrapped pipeline reports itself as fitted (``pipeline._is_fitted``),
        the wrapper is marked as fitted too and adopts the pipeline's ``classes_``.

        Args:
            pipeline (PipelineBase or subclass obj): EvalML pipeline.
        """
        self.pipeline = pipeline
        self._estimator_type = "classifier"
        if pipeline._is_fitted:
            self._is_fitted = True
            # predict() validates fitted state through `is_fitted_`; set it
            # here as well so a wrapper around an already-fitted pipeline can
            # predict without being re-fitted.
            self.is_fitted_ = True
            self.classes_ = pipeline.classes_

    def fit(self, X, y):
        """Fits component to data.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self
        """
        # Fit first: if the pipeline raises, the wrapper must not be left
        # looking fitted (no classes_ / is_fitted_ set).
        self.pipeline.fit(X, y)
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y
        # Keep both flags in sync with the state __init__ produces for an
        # already-fitted pipeline.
        self._is_fitted = True
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Make predictions using selected features.

        Args:
            X (pd.DataFrame): Features

        Returns:
            np.ndarray: Predicted values.
        """
        # scikit-learn's check_is_fitted raises if `is_fitted_` is missing.
        check_is_fitted(self, "is_fitted_")

        return self.pipeline.predict(X).to_numpy()

    def predict_proba(self, X):
        """Make probability estimates for labels.

        Args:
            X (pd.DataFrame): Features.

        Returns:
            np.ndarray: Probability estimates.
        """
        # NOTE: no fitted-state check here (unchanged from the original);
        # any error for an unfitted pipeline comes from the pipeline itself.
        return self.pipeline.predict_proba(X).to_numpy()


class WrappedSKRegressor(BaseEstimator, RegressorMixin):
    """Scikit-learn regressor wrapper around an EvalML pipeline."""

    def __init__(self, pipeline):
        """Scikit-learn regressor wrapper class. Takes an EvalML pipeline as input and returns a scikit-learn regressor class wrapping that pipeline.

        Args:
            pipeline (PipelineBase or subclass obj): EvalML pipeline.
        """
        self.pipeline = pipeline
        self._estimator_type = "regressor"
        # scikit-learn needs an attribute ending in an underscore in order to
        # treat the estimator as fitted, so one is always present.
        self._is_fitted_ = True

    def fit(self, X, y):
        """Fit the wrapped pipeline on the given data.

        Args:
            X (pd.DataFrame or np.ndarray): the input training data of shape [n_samples, n_features]
            y (pd.Series, optional): the target training data of length [n_samples]

        Returns:
            self
        """
        wrapped = self.pipeline
        wrapped.fit(X, y)
        return self

    def predict(self, X):
        """Predict with the wrapped pipeline and return the result as a numpy array.

        Args:
            X (pd.DataFrame): Features.

        Returns:
            np.ndarray: Predicted values.
        """
        predictions = self.pipeline.predict(X)
        return predictions.to_numpy()


def scikit_learn_wrapped_estimator(evalml_obj):
    """Wrap an EvalML pipeline or estimator in a scikit-learn estimator.

    Pipelines are dispatched on their ``problem_type``: regression and time
    series regression pipelines become a ``WrappedSKRegressor``; binary and
    multiclass pipelines become a ``WrappedSKClassifier``.

    Any other object is treated as an EvalML estimator and dispatched on its
    ``supported_problem_types``, compared irrespective of order: exactly
    {regression, time series regression} maps to ``WrappedSKRegressor``;
    exactly {binary, multiclass, time series binary, time series multiclass}
    maps to ``WrappedSKClassifier``.

    Args:
        evalml_obj: EvalML pipeline or estimator to wrap.

    Returns:
        WrappedSKRegressor or WrappedSKClassifier: The wrapper around ``evalml_obj``.

    Raises:
        ValueError: If the object matches none of the supported cases.
    """
    # Imported locally rather than at module level.
    # NOTE(review): presumably this avoids a circular import - confirm.
    from evalml.pipelines.pipeline_base import PipelineBase

    if isinstance(evalml_obj, PipelineBase):
        problem_type = evalml_obj.problem_type
        if problem_type in [
            ProblemTypes.REGRESSION,
            ProblemTypes.TIME_SERIES_REGRESSION,
        ]:
            return WrappedSKRegressor(evalml_obj)
        if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
            return WrappedSKClassifier(evalml_obj)
    else:
        # EvalML Estimator
        try:
            # Compare as sets so the declaration order of the estimator's
            # supported problem types does not affect the dispatch.
            supported = frozenset(evalml_obj.supported_problem_types)
        except TypeError:
            # Non-iterable / unhashable declaration: matches nothing and
            # falls through to the ValueError below, as list equality did.
            supported = frozenset()
        if supported == frozenset(
            [ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION],
        ):
            return WrappedSKRegressor(evalml_obj)
        if supported == frozenset(
            [
                ProblemTypes.BINARY,
                ProblemTypes.MULTICLASS,
                ProblemTypes.TIME_SERIES_BINARY,
                ProblemTypes.TIME_SERIES_MULTICLASS,
            ],
        ):
            return WrappedSKClassifier(evalml_obj)
    raise ValueError("Could not wrap EvalML object in scikit-learn wrapper.")


def generate_component_code(element):
    r"""Creates and returns a string that contains the Python imports and code required for running the EvalML component.

    The import line is only emitted when the component's class is one of
    ``all_components()``. The generated variable name is the component's
    ``name`` with its first character lower-cased and spaces removed from the
    remainder.

    Args:
        element (component instance): The instance of the component to generate string Python code for.

    Returns:
        String representation of Python code that can be run separately in order to recreate the component instance.
        Does not include code for custom component implementation.

    Raises:
        ValueError: If the input element is not a component instance.

    Examples:
        >>> from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor
        >>> assert generate_component_code(DecisionTreeRegressor()) == "from evalml.pipelines.components.estimators.regressors.decision_tree_regressor import DecisionTreeRegressor\n\ndecisionTreeRegressor = DecisionTreeRegressor(**{'criterion': 'mse', 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0})"
        ...
        >>> from evalml.pipelines.components.transformers.imputers.simple_imputer import SimpleImputer
        >>> assert generate_component_code(SimpleImputer()) == "from evalml.pipelines.components.transformers.imputers.simple_imputer import SimpleImputer\n\nsimpleImputer = SimpleImputer(**{'impute_strategy': 'most_frequent', 'fill_value': None})"
    """
    if not isinstance(element, ComponentBase):
        raise ValueError(
            "Element must be a component instance, received {}".format(type(element)),
        )

    component_class = element.__class__
    class_name = component_class.__name__

    # Import statement (if any) goes first, construction statement last.
    lines = []
    if component_class in all_components():
        import_line = "from {} import {}\n".format(
            component_class.__module__,
            class_name,
        )
        lines.append(import_line)

    parameters = element.parameters
    display_name = element.name
    variable_name = display_name[0].lower() + display_name[1:].replace(" ", "")
    lines.append(
        "{0} = {1}(**{2})".format(variable_name, class_name, parameters),
    )

    return "\n".join(lines)


def make_balancing_dictionary(y, sampling_ratio):
    """Makes dictionary for oversampler components.

    Finds the ratio of each class to the majority class. If the ratio is smaller
    than ``sampling_ratio`` the class is oversampled to
    ``int(majority_count * sampling_ratio)`` samples; otherwise the class is
    left at its current count.

    Args:
        y (pd.Series): Target data.
        sampling_ratio (float): The balanced ratio we want the samples to meet.

    Returns:
        dict: Dictionary where keys are the classes, and the corresponding values are the counts of samples
        for each class that will satisfy sampling_ratio. Counts are plain Python ints.

    Raises:
        ValueError: If sampling ratio is not in the range (0, 1], or the target is empty
            or contains only null values.

    Examples:
        >>> import pandas as pd
        >>> y = pd.Series([1] * 4 + [2] * 8 + [3])
        >>> assert make_balancing_dictionary(y, 0.5) == {2: 8, 1: 4, 3: 4}
        >>> assert make_balancing_dictionary(y, 0.9) == {2: 8, 1: 7, 3: 7}
        >>> assert make_balancing_dictionary(y, 0.1) == {2: 8, 1: 4, 3: 1}
    """
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 < sampling_ratio <= 1:
        raise ValueError(
            "Sampling ratio must be in range (0, 1], received {}".format(
                sampling_ratio,
            ),
        )
    if len(y) == 0:
        raise ValueError("Target data must not be empty")

    # value_counts() sorts by descending frequency and drops nulls by default,
    # so the first entry is the majority class.
    value_counts = y.value_counts()
    if value_counts.empty:
        # Non-empty target made up solely of nulls: there is no class to balance.
        raise ValueError("Target data must contain at least one non-null value")

    majority_count = value_counts.iloc[0]
    sample_amount = int(majority_count * sampling_ratio)

    class_dic = {}
    for label, count in value_counts.items():
        if count / majority_count < sampling_ratio:
            # we want to oversample this class
            class_dic[label] = sample_amount
        else:
            # this class is already larger than the ratio, don't change
            class_dic[label] = int(count)
    return class_dic