Source code for evalml.pipelines.classification_pipeline

import pandas as pd
from sklearn.preprocessing import LabelEncoder

from evalml.pipelines import PipelineBase
from evalml.utils import infer_feature_types


class ClassificationPipeline(PipelineBase):
    """Pipeline subclass for all classification pipelines."""

    def __init__(
        self,
        component_graph,
        parameters=None,
        custom_name=None,
        random_seed=0,
    ):
        """Create a classification pipeline with an unfitted target label encoder.

        Arguments:
            component_graph: Forwarded unchanged to the ``PipelineBase`` initializer.
            parameters: Forwarded unchanged to the ``PipelineBase`` initializer. Defaults to None.
            custom_name: Forwarded unchanged to the ``PipelineBase`` initializer. Defaults to None.
            random_seed: Forwarded unchanged to the ``PipelineBase`` initializer. Defaults to 0.
        """
        # The encoder is created before delegating to the base initializer so that
        # it already exists on the instance while the base class sets itself up.
        self._encoder = LabelEncoder()
        super().__init__(
            component_graph,
            custom_name=custom_name,
            parameters=parameters,
            random_seed=random_seed,
        )

    def fit(self, X, y):
        """Build a classification model. For string and categorical targets, classes are sorted
            by sorted(set(y)) and then are mapped to values between 0 and n_classes-1.

        Arguments:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series, np.ndarray): The target training labels of length [n_samples]

        Returns:
            self

        """
        X = infer_feature_types(X)
        y = infer_feature_types(y)
        # Fit the label encoder on the raw targets first, then train the
        # underlying pipeline on the integer-encoded targets.
        self._encoder.fit(y)
        y = self._encode_targets(y)
        self._fit(X, y)
        return self

    def _encode_targets(self, y):
        """Converts target values from their original values to integer values that can be processed.

        Arguments:
            y (pd.Series): Target values; its ``index`` and ``name`` are carried over to the result.

        Returns:
            pd.Series: Encoded targets with the same index and name as ``y``.

        Raises:
            ValueError: If the encoder rejects ``y``. The encoder's own error is attached
                as the explicit cause.
        """
        try:
            encoded = self._encoder.transform(y)
        except ValueError as e:
            # Chain explicitly so the traceback shows the encoder failure as the
            # direct cause instead of an unrelated "during handling" error.
            raise ValueError(str(e)) from e
        return pd.Series(encoded, index=y.index, name=y.name)

    def _decode_targets(self, y):
        """Converts encoded numerical values to their original target values.

        Note: we cast y as ints first to address boolean values that may be returned from
        calculating predictions which we would not be able to otherwise transform if we
        originally had integer targets.

        Arguments:
            y: Encoded predictions; must support ``astype(int)``.

        Returns:
            The result of the encoder's ``inverse_transform`` on the int-cast values.
        """
        return self._encoder.inverse_transform(y.astype(int))

    @property
    def classes_(self):
        """Gets the class names for the problem.

        Raises:
            AttributeError: If the encoder has no ``classes_`` attribute yet, i.e. the
                pipeline has not been fitted.
        """
        if not hasattr(self._encoder, "classes_"):
            raise AttributeError(
                "Cannot access class names before fitting the pipeline."
            )
        return self._encoder.classes_

    def _predict(self, X, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (pd.DataFrame): Data of shape [n_samples, n_features]
            objective (Object or string): The objective to use to make predictions.
                Not used by this implementation; accepted so that ``predict`` can
                always forward it.

        Returns:
            pd.Series: Estimated labels
        """
        return self.component_graph.predict(X)

    def predict(self, X, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            objective (Object or string): The objective to use to make predictions

        Returns:
            pd.Series: Estimated labels, decoded back to the original target values and
                carrying the same index as the raw predictions when they have one.
        """
        raw_predictions = self._predict(X, objective=objective)
        # Decoding yields a bare array, so re-attach the index of the raw
        # predictions (when present) to keep the output aligned row-for-row.
        predictions = pd.Series(
            self._decode_targets(raw_predictions),
            index=getattr(raw_predictions, "index", None),
            name=self.input_target_name,
        )
        return infer_feature_types(predictions)

    def predict_proba(self, X):
        """Make probability estimates for labels.

        Arguments:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]

        Returns:
            pd.DataFrame: Probability estimates, with columns renamed to the class names
                known to the label encoder.
        """
        X = self.compute_estimator_features(X, y=None)
        proba = self.estimator.predict_proba(X)
        # Pair the estimator's output columns positionally with the encoder's classes.
        proba = proba.ww.rename(
            columns=dict(zip(proba.columns, self._encoder.classes_))
        )
        return infer_feature_types(proba)

    def score(self, X, y, objectives):
        """Evaluate model performance on objectives

        Arguments:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (pd.Series, or np.ndarray): True labels of length [n_samples]
            objectives (list): List of objectives to score

        Returns:
            dict: Ordered dictionary of objective scores
        """
        y = infer_feature_types(y)
        objectives = self.create_objectives(objectives)
        # Scoring compares encoded targets against the undecoded output of _predict.
        y = self._encode_targets(y)
        y_predicted, y_predicted_proba = self._compute_predictions(X, y, objectives)
        return self._score_all_objectives(
            X, y, y_predicted, y_predicted_proba, objectives
        )

    def _compute_predictions(self, X, y, objectives, time_series=False):
        """Compute predictions/probabilities based on objectives.

        Only the outputs that at least one objective needs are computed; the other
        is returned as None.

        Arguments:
            X: Input data passed to the prediction methods.
            y: Targets; only forwarded to the prediction methods when ``time_series`` is True.
            objectives: Iterable of objectives exposing ``score_needs_proba``.
            time_series (bool): If True, call ``predict_proba(X, y)`` and
                ``_predict(X, y, pad=True)``. NOTE(review): those signatures are not
                defined on this class; presumably a subclass overrides them - confirm.

        Returns:
            tuple: ``(y_predicted, y_predicted_proba)``
        """
        y_predicted = None
        y_predicted_proba = None
        if any(o.score_needs_proba for o in objectives):
            y_predicted_proba = (
                self.predict_proba(X, y) if time_series else self.predict_proba(X)
            )
        if any(not o.score_needs_proba for o in objectives):
            y_predicted = (
                self._predict(X, y, pad=True) if time_series else self._predict(X)
            )
        return y_predicted, y_predicted_proba