# Source code for evalml.pipelines.components.estimators.classifiers.baseline_classifier
"""Baseline classifier."""
import numpy as np
import pandas as pd
from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import get_random_state, infer_feature_types
class BaselineClassifier(Estimator):
    """Classifier that predicts using the specified strategy.

    This is useful as a simple baseline classifier to compare with other classifiers.

    Args:
        strategy (str): Method used to predict. Valid options are "mode", "random" and "random_weighted". Defaults to "mode".
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """
name = "Baseline Classifier"
hyperparameter_ranges = {}
"""{}"""
model_family = ModelFamily.BASELINE
"""ModelFamily.BASELINE"""
supported_problem_types = [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]
"""[ProblemTypes.BINARY, ProblemTypes.MULTICLASS]"""
def __init__(self, strategy="mode", random_seed=0, **kwargs):
if strategy not in ["mode", "random", "random_weighted"]:
raise ValueError(
"'strategy' parameter must equal either 'mode', 'random', or 'random_weighted'",
)
parameters = {"strategy": strategy}
parameters.update(kwargs)
self._classes = None
self._percentage_freq = None
self._num_features = None
self._num_unique = None
self._mode = None
super().__init__(
parameters=parameters,
component_obj=None,
random_seed=random_seed,
)
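    # Illustrative note (not part of the original module): the three accepted
    # strategies, and the failure mode of the validation above, e.g.
    #
    #   BaselineClassifier(strategy="mode")             # predict the most frequent training class
    #   BaselineClassifier(strategy="random")           # sample classes uniformly at random
    #   BaselineClassifier(strategy="random_weighted")  # sample classes by training frequency
    #   BaselineClassifier(strategy="median")           # raises ValueError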
    def fit(self, X, y=None):
        """Fits baseline classifier component to data.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            ValueError: If y is None.
        """
if y is None:
raise ValueError("Cannot fit Baseline classifier if y is None")
X = infer_feature_types(X)
y = infer_feature_types(y)
vals, counts = np.unique(y, return_counts=True)
self._classes = list(vals)
self._percentage_freq = counts.astype(float) / len(y)
self._num_unique = len(self._classes)
self._num_features = X.shape[1]
if self.parameters["strategy"] == "mode":
self._mode = y.mode()[0]
return self
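    # Illustrative sketch (not from the original docs) of the statistics fit()
    # stores, assuming a hypothetical toy target y = pd.Series([0, 1, 1]):
    #   self._classes         -> [0, 1]
    #   self._percentage_freq -> array([0.333..., 0.666...])  (counts / len(y))
    #   self._num_unique      -> 2
    #   self._mode            -> 1  (only computed for the "mode" strategy)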
    def predict(self, X):
        """Make predictions using the baseline classification strategy.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].

        Returns:
            pd.Series: Predicted values.
        """
X = infer_feature_types(X)
strategy = self.parameters["strategy"]
if strategy == "mode":
predictions = pd.Series([self._mode] * len(X))
elif strategy == "random":
predictions = get_random_state(self.random_seed).choice(
self._classes,
len(X),
)
else:
predictions = get_random_state(self.random_seed).choice(
self._classes,
len(X),
p=self._percentage_freq,
)
return infer_feature_types(predictions)
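    # Illustrative sketch (not part of the original module): with the toy fit
    # above (classes [0, 1], mode 1, frequencies [1/3, 2/3]) and len(X) == 4,
    #   "mode"            -> pd.Series([1, 1, 1, 1])
    #   "random"          -> classes drawn uniformly from [0, 1]
    #   "random_weighted" -> classes drawn with probabilities [1/3, 2/3]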
    def predict_proba(self, X):
        """Make prediction probabilities using the baseline classification strategy.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].

        Returns:
            pd.DataFrame: Predicted probability values.
        """
X = infer_feature_types(X)
strategy = self.parameters["strategy"]
if strategy == "mode":
mode_index = self._classes.index(self._mode)
proba_arr = np.array(
[[1.0 if i == mode_index else 0.0 for i in range(self._num_unique)]]
* len(X),
)
elif strategy == "random":
proba_arr = np.array(
[[1.0 / self._num_unique for i in range(self._num_unique)]] * len(X),
)
else:
proba_arr = np.array(
[[self._percentage_freq[i] for i in range(self._num_unique)]] * len(X),
)
predictions = pd.DataFrame(proba_arr, columns=self._classes)
return infer_feature_types(predictions)
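    # Illustrative sketch (not part of the original module): with the toy fit
    # above, each row of the returned frame (columns [0, 1]) is
    #   "mode"            -> [0.0, 1.0]   (probability 1 on the mode class)
    #   "random"          -> [0.5, 0.5]   (uniform over the classes)
    #   "random_weighted" -> [1/3, 2/3]   (the training class frequencies)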
@property
def feature_importance(self):
"""Returns importance associated with each feature. Since baseline classifiers do not use input features to calculate predictions, returns an array of zeroes.
Returns:
pd.Series: An array of zeroes
"""
return pd.Series(np.zeros(self._num_features))
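    # Illustrative: for an X with 3 columns this returns pd.Series([0.0, 0.0, 0.0]).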
@property
def classes_(self):
"""Returns class labels. Will return None before fitting.
Returns:
list[str] or list(float) : Class names
"""
return self._classes
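# Minimal usage sketch (illustrative, not part of the original module). It
# assumes evalml and its dependencies (pandas, woodwork) are installed and
# simply exercises the class defined above on a small hypothetical dataset.
if __name__ == "__main__":
    X = pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
    y = pd.Series([0, 1, 1, 1])
    for strategy in ["mode", "random", "random_weighted"]:
        clf = BaselineClassifier(strategy=strategy, random_seed=0)
        clf.fit(X, y)
        # "mode" always predicts the majority class (1); the random strategies
        # draw from the training classes, uniformly or frequency-weighted.
        print(strategy, clf.predict(X).tolist())
        print(clf.predict_proba(X))
        print(clf.feature_importance)  # Series of zeroes, one per input feature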