Source code for evalml.pipelines.pipeline_base

from collections import OrderedDict

import pandas as pd
from sklearn.model_selection import train_test_split

from .components import Estimator, handle_component
from .pipeline_plots import PipelinePlots

from evalml.objectives import get_objective
from evalml.utils import Logger


[docs]class PipelineBase:

    # Necessary for "Plotting" documentation, since Sphinx does not work well with instance attributes.
    plot = PipelinePlots

[docs]    def __init__(self, objective, component_list, n_jobs, random_state):
        """Machine learning pipeline made out of transformers and a estimator.

        Arguments:
            objective (Object): the objective to optimize

            component_list (list): List of components in order

            random_state (int): random seed/state

            n_jobs (int): Number of jobs to run in parallel
        """

        self.objective = get_objective(objective)
        self.random_state = random_state
        self.component_list = [handle_component(component) for component in component_list]
        self.component_names = [comp.name for comp in self.component_list]
        self.input_feature_names = {}
        # check if one and only estimator in pipeline is the last element in component_list

        if not isinstance(self.component_list[-1], Estimator):
            raise ValueError("A pipeline must have an Estimator as the last component in component_list.")

        self.estimator = self.component_list[-1]
        self.problem_types = self.estimator.problem_types
        self.model_type = self.estimator.model_type

        self.name = self._generate_name()  # autogenerated
        self.results = {}
        self.n_jobs = n_jobs
        self.parameters = {}
        for component in self.component_list:
            self.parameters.update(component.parameters)

        self.plot = PipelinePlots(self)
        self.logger = Logger()

    def __getitem__(self, index):
        if isinstance(index, slice):
            raise NotImplementedError('Slicing pipelines is currently not supported.')
        elif isinstance(index, int):
            return self.component_list[index]
        else:
            return self.get_component(index)

    def __setitem__(self, index, value):
        raise NotImplementedError('Setting pipeline components is not supported.')

    def _generate_name(self):
        if self.estimator is not None:
            name = "{}".format(self.estimator.name)
        else:
            name = "Pipeline"
        for index, component in enumerate(self.component_list[:-1]):
            if index == 0:
                name += " w/ {}".format(component.name)
            else:
                name += " + {}".format(component.name)

        return name

[docs]    def get_component(self, name):
        """Returns component by name

        Arguments:
            name (str): name of component

        Returns:
            Component: component to return

        """
        return next((component for component in self.component_list if component.name == name), None)

[docs]    def describe(self, return_dict=False):
        """Outputs pipeline details including component parameters

        Arguments:
            return_dict (bool): If True, return dictionary of information about pipeline. Defaults to false

        Returns:
            dict: dictionary of all component parameters if return_dict is True, else None
        """
        self.logger.log_title(self.name)
        self.logger.log("Problem Types: {}".format(', '.join([str(problem_type) for problem_type in self.problem_types])))
        self.logger.log("Model Type: {}".format(str(self.model_type)))
        better_string = "lower is better"
        if self.objective.greater_is_better:
            better_string = "greater is better"
        objective_string = "Objective to Optimize: {} ({})".format(self.objective.name, better_string)
        self.logger.log(objective_string)

        if self.estimator.name in self.input_feature_names:
            self.logger.log("Number of features: {}".format(len(self.input_feature_names[self.estimator.name])))

        # Summary of steps
        self.logger.log_subtitle("Pipeline Steps")
        for number, component in enumerate(self.component_list, 1):
            component_string = str(number) + ". " + component.name
            self.logger.log(component_string)
            component.describe(print_name=False)

        if return_dict:
            return self.parameters

    def _transform(self, X):
        X_t = X
        for component in self.component_list[:-1]:
            X_t = component.transform(X_t)
        return X_t

    def _fit(self, X, y):
        X_t = X
        y_t = y
        for component in self.component_list[:-1]:
            self.input_feature_names.update({component.name: list(pd.DataFrame(X_t))})
            if component._needs_fitting:
                X_t = component.fit_transform(X_t, y_t)
            else:
                X_t = component.transform(X_t, y_t)
        self.input_feature_names.update({self.estimator.name: list(pd.DataFrame(X_t))})
        self.estimator.fit(X_t, y_t)

[docs]    def fit(self, X, y, objective_fit_size=.2):
        """Build a model

        Arguments:
            X (pd.DataFrame or np.array): the input training data of shape [n_samples, n_features]

            y (pd.Series): the target training labels of length [n_samples]

            feature_types (list, optional): list of feature types. either numeric of categorical.
                categorical features will automatically be encoded

        Returns:

            self

        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        if not isinstance(y, pd.Series):
            y = pd.Series(y)

        if self.objective.needs_fitting:
            X, X_objective, y, y_objective = train_test_split(X, y, test_size=objective_fit_size, random_state=self.random_state)

        self._fit(X, y)

        if self.objective.needs_fitting:
            if self.objective.fit_needs_proba:
                y_predicted = self.predict_proba(X_objective)
            else:
                y_predicted = self.predict(X_objective)

            if self.objective.uses_extra_columns:
                self.objective.fit(y_predicted, y_objective, X_objective)
            else:
                self.objective.fit(y_predicted, y_objective)
        return self

[docs]    def predict(self, X):
        """Make predictions using selected features.

        Args:
            X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]

        Returns:
            pd.Series : estimated labels
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        X_t = self._transform(X)

        if self.objective and self.objective.needs_fitting:
            if self.objective.fit_needs_proba:
                y_predicted = self.predict_proba(X)
            else:
                X_t = self._transform(X)
                y_predicted = self.estimator.predict(X_t)

            if self.objective.uses_extra_columns:
                return self.objective.predict(y_predicted, X)

            return self.objective.predict(y_predicted)

        return self.estimator.predict(X_t)

[docs]    def predict_proba(self, X):
        """Make probability estimates for labels.

        Args:
            X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]

        Returns:
            pd.DataFrame : probability estimates
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        X = self._transform(X)
        proba = self.estimator.predict_proba(X)

        if proba.shape[1] <= 2:
            return proba[:, 1]
        else:
            return proba

[docs]    def score(self, X, y, other_objectives=None):
        """Evaluate model performance on current and additional objectives

        Args:
            X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]
            y (pd.Series) : true labels of length [n_samples]
            other_objectives (list): list of other objectives to score

        Returns:
            float, dict:  score, ordered dictionary of other objective scores
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        if not isinstance(y, pd.Series):
            y = pd.Series(y)

        other_objectives = other_objectives or []
        other_objectives = [get_objective(o) for o in other_objectives]
        y_predicted = None
        y_predicted_proba = None

        scores = []
        for objective in [self.objective] + other_objectives:
            if objective.score_needs_proba:
                if y_predicted_proba is None:
                    y_predicted_proba = self.predict_proba(X)
                y_predictions = y_predicted_proba
            else:
                if y_predicted is None:
                    y_predicted = self.predict(X)
                y_predictions = y_predicted

            if objective.uses_extra_columns:
                scores.append(objective.score(y_predictions, y, X))
            else:
                scores.append(objective.score(y_predictions, y))
        if not other_objectives:
            return scores[0], {}

        other_scores = OrderedDict(zip([n.name for n in other_objectives], scores[1:]))

        return scores[0], other_scores

    @property
    def feature_importances(self):
        """Return feature importances. Feature dropped by feaure selection are excluded"""
        feature_names = self.input_feature_names[self.estimator.name]
        importances = list(zip(feature_names, self.estimator.feature_importances))  # note: this only works for binary
        importances.sort(key=lambda x: -abs(x[1]))
        df = pd.DataFrame(importances, columns=["feature", "importance"])
        return df