Source code for evalml.pipelines.classification_pipeline
"""Pipeline subclass for all classification pipelines."""importnumpyasnpimportpandasaspdfromevalml.pipelinesimportPipelineBasefromevalml.problem_typesimportis_binary,is_multiclassfromevalml.utilsimportinfer_feature_types
class ClassificationPipeline(PipelineBase):
    """Pipeline subclass for all classification pipelines.

    Args:
        component_graph (list or dict): List of components in order. Accepts strings or
            ComponentBase subclasses in the list. Note that when duplicate components are
            specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
            [Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names
            ["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"]
        parameters (dict): Dictionary with component names as keys and dictionary of that
            component's parameters as values. An empty dictionary or None implies using all
            default values for component parameters. Defaults to None.
        custom_name (str): Custom name for the pipeline. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    def __init__(
        self,
        component_graph,
        parameters=None,
        custom_name=None,
        random_seed=0,
    ):
        # Set before the base initializer runs so the attribute always exists.
        self._classes_ = None
        super().__init__(
            component_graph,
            custom_name=custom_name,
            parameters=parameters,
            random_seed=random_seed,
        )
        # The label encoder is optional; the graph lookup raises ValueError
        # when no component named "Label Encoder" is present.
        try:
            self._encoder = self.component_graph.get_component("Label Encoder")
        except ValueError:
            self._encoder = None

    def fit(self, X, y):
        """Build a classification model.

        For string and categorical targets, classes are sorted by sorted(set(y)) and then
        are mapped to values between 0 and n_classes-1.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape
                [n_samples, n_features]
            y (pd.Series, np.ndarray): The target training labels of length [n_samples]

        Returns:
            self

        Raises:
            ValueError: If the number of unique classes in y are not appropriate for the
                type of pipeline.
            TypeError: If the unique classes of y cannot be computed for a reason other
                than pd.NA being present in the series.
        """
        X = infer_feature_types(X)
        y = infer_feature_types(y)

        n_unique = y.nunique()
        if is_binary(self.problem_type) and n_unique != 2:
            raise ValueError("Binary pipelines require y to have 2 unique classes!")
        elif is_multiclass(self.problem_type) and n_unique < 3:
            # "< 3" (rather than "in [1, 2]") also rejects a target with no classes at all.
            raise ValueError(
                "Multiclass pipelines require y to have 3 or more unique classes!",
            )

        self._fit(X, y)

        # TODO: Added this in because numpy's unique() does not support pandas.NA
        try:
            self._classes_ = list(np.unique(y))
        except TypeError as e:
            if "boolean value of NA is ambiguous" not in str(e):
                # Unrelated TypeErrors must not be swallowed: that would silently
                # leave classes_ as None on a pipeline that looks fitted.
                raise
            # Fall back to pandas' unique(); keep the result a list so that
            # classes_ has the same type on both code paths.
            self._classes_ = list(y.unique())

        return self

    def _encode_targets(self, y):
        """Converts target values from their original values to integer values that can be processed."""
        if self._encoder is not None:
            try:
                return pd.Series(
                    self._encoder.transform(None, y)[1],
                    index=y.index,
                    name=y.name,
                ).astype(int)
            except ValueError as e:
                raise ValueError(str(e)) from e
        # No label encoder in the graph: targets are passed through untouched.
        return y

    @property
    def classes_(self):
        """Gets the class names for the pipeline. Will return None before pipeline is fit."""
        return self._classes_

    def _predict(self, X, objective=None):
        """Make predictions using selected features.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            objective (Object or string): The objective to use to make predictions.
                Not used by this implementation.

        Returns:
            pd.Series: Estimated labels
        """
        return self.component_graph.predict(X)

    def predict(self, X, objective=None, X_train=None, y_train=None):
        """Make predictions using selected features.

        Note: we cast y as ints first to address boolean values that may be returned from
        calculating predictions which we would not be able to otherwise transform if we
        originally had integer targets.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            objective (Object or string): The objective to use to make predictions.
            X_train (pd.DataFrame): Training data. Ignored. Only used for time series.
            y_train (pd.Series): Training labels. Ignored. Only used for time series.

        Returns:
            pd.Series: Estimated labels.
        """
        _predictions = self._predict(X, objective=objective)
        predictions = self.inverse_transform(_predictions.astype(int))
        # Re-wrap so the result keeps the index of the raw predictions and
        # carries the name of the training target.
        predictions = pd.Series(
            predictions,
            name=self.input_target_name,
            index=_predictions.index,
        )
        return infer_feature_types(predictions)

    def predict_proba(self, X, X_train=None, y_train=None):
        """Make probability estimates for labels.

        Args:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features]
            X_train (pd.DataFrame or np.ndarray or None): Training data. Ignored.
                Only used for time series.
            y_train (pd.Series or None): Training labels. Ignored. Only used for time series.

        Returns:
            pd.DataFrame: Probability estimates

        Raises:
            ValueError: If final component is not an estimator.
        """
        if self.estimator is None:
            raise ValueError(
                "Cannot call predict_proba() on a component graph because the final component is not an Estimator.",
            )
        X = self.transform_all_but_final(X, y=None)
        proba = self.estimator.predict_proba(X)
        # Relabel the probability columns, pairing them positionally with classes_.
        proba = proba.ww.rename(
            columns=dict(zip(proba.columns, self.classes_)),
        )
        return infer_feature_types(proba)

    def score(self, X, y, objectives, X_train=None, y_train=None):
        """Evaluate model performance on objectives.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features]
            y (pd.Series): True labels of length [n_samples]
            objectives (list): List of objectives to score
            X_train (pd.DataFrame): Training data. Ignored. Only used for time series.
            y_train (pd.Series): Training labels. Ignored. Only used for time series.

        Returns:
            dict: Ordered dictionary of objective scores.
        """
        y = infer_feature_types(y)
        objectives = self.create_objectives(objectives)
        if self._encoder is not None:
            y = self._encode_targets(y)
        y_predicted, y_predicted_proba = self._compute_predictions(X, y, objectives)
        return self._score_all_objectives(
            X,
            y,
            y_predicted,
            y_predicted_proba,
            objectives,
        )

    def _compute_predictions(self, X, y, objectives):
        """Compute predictions/probabilities based on objectives."""
        y_predicted = None
        y_predicted_proba = None
        # Only compute what the requested objectives actually consume.
        if any(o.score_needs_proba for o in objectives):
            y_predicted_proba = self.predict_proba(X)
        if any(not o.score_needs_proba for o in objectives):
            y_predicted = self._predict(X)
        return y_predicted, y_predicted_proba