# Source code for evalml.pipelines.classification_pipeline

import pandas as pd
from sklearn.preprocessing import LabelEncoder

from evalml.pipelines import PipelineBase
from evalml.utils import infer_feature_types


class ClassificationPipeline(PipelineBase):
    """Pipeline subclass for all classification pipelines."""

    def __init__(
        self,
        component_graph,
        parameters=None,
        custom_name=None,
        random_seed=0,
    ):
        """Create the label encoder and initialize the base pipeline.

        Arguments:
            component_graph: Forwarded unchanged to ``PipelineBase.__init__``.
            parameters: Forwarded unchanged to ``PipelineBase.__init__``. Defaults to None.
            custom_name: Forwarded unchanged to ``PipelineBase.__init__``. Defaults to None.
            random_seed: Forwarded unchanged to ``PipelineBase.__init__``. Defaults to 0.
        """
        # The encoder is created before the base initializer runs, so it
        # already exists on the instance for anything invoked from there.
        self._encoder = LabelEncoder()
        super().__init__(
            component_graph,
            custom_name=custom_name,
            parameters=parameters,
            random_seed=random_seed,
        )

    def fit(self, X, y):
        """Build a classification model.

        For string and categorical targets, classes are sorted by
        sorted(set(y)) and then are mapped to values between 0 and
        n_classes-1.

        Arguments:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series, np.ndarray): The target training labels of length [n_samples]

        Returns:
            self
        """
        X = infer_feature_types(X)
        y = infer_feature_types(y)
        # Fit the encoder on the original labels, then train on the encoded ones.
        self._encoder.fit(y)
        y = self._encode_targets(y)
        self._fit(X, y)
        return self

    def _encode_targets(self, y):
        """Converts target values from their original values to integer values that can be processed.

        Arguments:
            y (pd.Series): Target values; its index and name are kept on the result.

        Returns:
            pd.Series: Encoded targets with the same index and name as ``y``.

        Raises:
            ValueError: If the encoder's ``transform`` raises a ValueError for ``y``.
        """
        try:
            encoded = self._encoder.transform(y)
        except ValueError as e:
            # Same exception type and message as before; the explicit cause
            # keeps the encoder's traceback attached.
            raise ValueError(str(e)) from e
        return pd.Series(encoded, index=y.index, name=y.name)

    def _decode_targets(self, y):
        """Converts encoded numerical values to their original target values.

        Note: we cast y as ints first to address boolean values that may be
        returned from calculating predictions which we would not be able to
        otherwise transform if we originally had integer targets.

        Arguments:
            y: Encoded target values; must support ``astype(int)``.

        Returns:
            The result of the encoder's ``inverse_transform`` on the int-cast values.
        """
        return self._encoder.inverse_transform(y.astype(int))

    @property
    def classes_(self):
        """Gets the class names for the problem.

        Raises:
            AttributeError: If the encoder has no ``classes_`` attribute yet.
        """
        if not hasattr(self._encoder, "classes_"):
            raise AttributeError(
                "Cannot access class names before fitting the pipeline."
            )
        return self._encoder.classes_

    def _predict(self, X, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (pd.DataFrame): Data of shape [n_samples, n_features]
            objective (Object or string): The objective to use to make predictions.
                Not used by this implementation.

        Returns:
            pd.Series: Estimated labels
        """
        return self.component_graph.predict(X)

    def predict(self, X, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            objective (Object or string): The objective to use to make predictions

        Returns:
            pd.Series: Estimated labels, decoded back to the original target values
        """
        encoded = self._predict(X, objective=objective)
        decoded = self._decode_targets(encoded)
        # Decoding yields a plain array, so carry the index of the raw
        # predictions over explicitly; otherwise the result would be
        # renumbered 0..n-1. Falls back to the default index if the raw
        # predictions expose no ``index`` attribute.
        predictions = pd.Series(
            decoded,
            name=self.input_target_name,
            index=getattr(encoded, "index", None),
        )
        return infer_feature_types(predictions)

    def predict_proba(self, X):
        """Make probability estimates for labels.

        Arguments:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]

        Returns:
            pd.DataFrame: Probability estimates, with columns renamed to the class names
        """
        X = self.compute_estimator_features(X, y=None)
        proba = self.estimator.predict_proba(X)
        # Pair each probability column, in order, with the encoder's class name.
        proba = proba.ww.rename(
            columns=dict(zip(proba.columns, self._encoder.classes_))
        )
        return infer_feature_types(proba)

    def score(self, X, y, objectives):
        """Evaluate model performance on objectives

        Arguments:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (pd.Series, or np.ndarray): True labels of length [n_samples]
            objectives (list): List of objectives to score

        Returns:
            dict: Ordered dictionary of objective scores
        """
        y = infer_feature_types(y)
        objectives = self.create_objectives(objectives)
        # Score in the encoded label space: ``_predict`` output is not decoded.
        y = self._encode_targets(y)
        y_predicted, y_predicted_proba = self._compute_predictions(X, y, objectives)
        return self._score_all_objectives(
            X, y, y_predicted, y_predicted_proba, objectives
        )

    def _compute_predictions(self, X, y, objectives, time_series=False):
        """Compute predictions/probabilities based on objectives.

        Only the outputs actually required by ``objectives`` are computed;
        the other one is returned as None.

        Arguments:
            X: Input data passed to the prediction methods.
            y: Targets; only passed on to the prediction methods when ``time_series`` is True.
            objectives: Iterable of objectives exposing ``score_needs_proba``.
            time_series (bool): If True, call ``predict_proba(X, y)`` and
                ``_predict(X, y, pad=True)`` instead of the X-only forms. Defaults to False.

        Returns:
            tuple: ``(y_predicted, y_predicted_proba)``
        """
        y_predicted = None
        y_predicted_proba = None
        if any(o.score_needs_proba for o in objectives):
            y_predicted_proba = (
                self.predict_proba(X, y) if time_series else self.predict_proba(X)
            )
        if any(not o.score_needs_proba for o in objectives):
            y_predicted = (
                self._predict(X, y, pad=True) if time_series else self._predict(X)
            )
        return y_predicted, y_predicted_proba