import pandas as pd
from sklearn.preprocessing import LabelEncoder
from evalml.pipelines import PipelineBase
from evalml.utils import infer_feature_types
class ClassificationPipeline(PipelineBase):
    """Pipeline subclass for all classification pipelines.

    Target labels are mapped to integers with a ``LabelEncoder`` when the
    pipeline is fit, and mapped back to their original values when
    predictions are returned from ``predict``.
    """

    def __init__(
        self,
        component_graph,
        parameters=None,
        custom_name=None,
        random_seed=0,
    ):
        """Create the pipeline and its (not yet fitted) target encoder.

        Arguments:
            component_graph: Component graph forwarded to ``PipelineBase``.
            parameters: Parameters forwarded to ``PipelineBase``. Defaults to None.
            custom_name: Custom name forwarded to ``PipelineBase``. Defaults to None.
            random_seed: Seed forwarded to ``PipelineBase``. Defaults to 0.
        """
        # The encoder is created before the base initializer runs, so it is
        # already present on the instance while the base class sets itself up.
        self._encoder = LabelEncoder()
        super().__init__(
            component_graph,
            custom_name=custom_name,
            parameters=parameters,
            random_seed=random_seed,
        )

    def fit(self, X, y):
        """Build a classification model. For string and categorical targets, classes are sorted
        by sorted(set(y)) and then are mapped to values between 0 and n_classes-1.

        Arguments:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features]
            y (pd.Series, np.ndarray): The target training labels of length [n_samples]

        Returns:
            self
        """
        X = infer_feature_types(X)
        y = infer_feature_types(y)
        # Learn the label -> integer mapping first, then train on encoded targets.
        self._encoder.fit(y)
        y = self._encode_targets(y)
        self._fit(X, y)
        return self

    def _encode_targets(self, y):
        """Converts target values from their original values to integer values that can be processed.

        Arguments:
            y (pd.Series): Target values; its index and name are carried over to the result.

        Returns:
            pd.Series: Encoded targets with the same index and name as ``y``.

        Raises:
            ValueError: If the encoder's ``transform`` raises a ValueError for ``y``.
        """
        try:
            encoded = self._encoder.transform(y)
        except ValueError as e:
            # Same exception type and message as before; the original error is
            # now chained explicitly so the traceback shows where it came from.
            raise ValueError(str(e)) from e
        return pd.Series(encoded, index=y.index, name=y.name)

    def _decode_targets(self, y):
        """Converts encoded numerical values to their original target values.

        Note: we cast y as ints first to address boolean values that may be returned from
        calculating predictions which we would not be able to otherwise transform if we
        originally had integer targets.

        Arguments:
            y: Encoded values exposing ``astype``.

        Returns:
            The result of the encoder's ``inverse_transform`` on the int-cast values.
        """
        return self._encoder.inverse_transform(y.astype(int))

    @property
    def classes_(self):
        """Gets the class names for the problem.

        Raises:
            AttributeError: If the encoder has no ``classes_`` attribute yet,
                i.e. the pipeline has not been fitted.
        """
        if not hasattr(self._encoder, "classes_"):
            raise AttributeError(
                "Cannot access class names before fitting the pipeline."
            )
        return self._encoder.classes_

    def _predict(self, X, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (pd.DataFrame): Data of shape [n_samples, n_features]
            objective (Object or string): The objective to use to make predictions.
                Not used by this implementation.

        Returns:
            pd.Series: Estimated labels
        """
        return self.component_graph.predict(X)

    def predict(self, X, objective=None):
        """Make predictions using selected features.

        Arguments:
            X (pd.DataFrame, or np.ndarray): Data of shape [n_samples, n_features]
            objective (Object or string): The objective to use to make predictions

        Returns:
            pd.Series: Estimated labels
        """
        encoded = self._predict(X, objective=objective)
        predictions = pd.Series(
            self._decode_targets(encoded),
            # Keep the row labels of the encoded predictions instead of
            # silently replacing them with a fresh RangeIndex; falls back to
            # the default index when the predictions carry no index.
            index=getattr(encoded, "index", None),
            name=self.input_target_name,
        )
        return infer_feature_types(predictions)

    def predict_proba(self, X):
        """Make probability estimates for labels.

        Arguments:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]

        Returns:
            pd.DataFrame: Probability estimates
        """
        X = self.compute_estimator_features(X, y=None)
        proba = self.estimator.predict_proba(X)
        # Columns are matched to the encoder's classes by position.
        # NOTE(review): `.ww` is presumably the Woodwork accessor - confirm.
        proba = proba.ww.rename(
            columns=dict(zip(proba.columns, self._encoder.classes_))
        )
        return infer_feature_types(proba)

    def score(self, X, y, objectives):
        """Evaluate model performance on objectives

        Arguments:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            y (pd.Series, or np.ndarray): True labels of length [n_samples]
            objectives (list): List of objectives to score

        Returns:
            dict: Ordered dictionary of objective scores
        """
        y = infer_feature_types(y)
        objectives = self.create_objectives(objectives)
        # Scoring compares against encoded targets, matching what the
        # pipeline was trained on.
        y = self._encode_targets(y)
        y_predicted, y_predicted_proba = self._compute_predictions(X, y, objectives)
        return self._score_all_objectives(
            X, y, y_predicted, y_predicted_proba, objectives
        )

    def _compute_predictions(self, X, y, objectives, time_series=False):
        """Compute predictions/probabilities based on objectives.

        Probabilities are computed only if at least one objective has
        ``score_needs_proba`` set, and label predictions only if at least one
        objective does not.

        Arguments:
            X: Input features.
            y: Targets; only passed along when ``time_series`` is True.
            objectives: Objectives exposing ``score_needs_proba``.
            time_series (bool): If True, call ``predict_proba(X, y)`` and
                ``_predict(X, y, pad=True)`` instead of the single-argument
                forms. NOTE(review): those signatures differ from the methods
                defined on this class - presumably supplied by time series
                subclasses; confirm.

        Returns:
            tuple: ``(y_predicted, y_predicted_proba)``; either element is None
            when it was not needed.
        """
        y_predicted = None
        y_predicted_proba = None
        if any(o.score_needs_proba for o in objectives):
            y_predicted_proba = (
                self.predict_proba(X, y) if time_series else self.predict_proba(X)
            )
        if any(not o.score_needs_proba for o in objectives):
            # Encoded (undecoded) predictions are used here, in line with the
            # encoded targets that `score` passes in.
            y_predicted = (
                self._predict(X, y, pad=True) if time_series else self._predict(X)
            )
        return y_predicted, y_predicted_proba