Source code for evalml.pipelines.components.transformers.dimensionality_reduction.pca
"""Component that reduces the number of features by using Principal Component Analysis (PCA)."""
import pandas as pd
from sklearn.decomposition import PCA as SkPCA
from skopt.space import Real
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import (
_retain_custom_types_and_initalize_woodwork,
infer_feature_types,
is_all_numeric,
)
[docs]class PCA(Transformer):
"""Reduces the number of features by using Principal Component Analysis (PCA).
Args:
variance (float): The percentage of the original data variance that should be preserved when reducing the
number of features. Defaults to 0.95.
n_components (int): The number of features to maintain after computing SVD. Defaults to None, but will override
variance variable if set.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
name = "PCA Transformer"
hyperparameter_ranges = {"variance": Real(0.25, 1)}
"""{"variance": Real(0.25, 1)}"""
def __init__(self, variance=0.95, n_components=None, random_seed=0, **kwargs):
parameters = {"variance": variance, "n_components": n_components}
parameters.update(kwargs)
if n_components:
pca = SkPCA(n_components=n_components, random_state=random_seed, **kwargs)
else:
pca = SkPCA(n_components=variance, random_state=random_seed, **kwargs)
super().__init__(
parameters=parameters, component_obj=pca, random_seed=random_seed
)
[docs] def fit(self, X, y=None):
"""Fits the PCA component.
Args:
X (pd.DataFrame): The input training data of shape [n_samples, n_features].
y (pd.Series, optional): The target training data of length [n_samples].
Returns:
self
Raises:
ValueError: If input data is not all numeric.
"""
X = infer_feature_types(X)
if not is_all_numeric(X):
raise ValueError("PCA input must be all numeric")
self._component_obj.fit(X)
return self
[docs] def transform(self, X, y=None):
"""Transform data using fitted PCA component.
Args:
X (pd.DataFrame): The input training data of shape [n_samples, n_features].
y (pd.Series, optional): The target training data of length [n_samples].
Returns:
pd.DataFrame: Transformed data.
Raises:
ValueError: If input data is not all numeric.
"""
X_ww = infer_feature_types(X)
if not is_all_numeric(X_ww):
raise ValueError("PCA input must be all numeric")
X_t = self._component_obj.transform(X)
X_t = pd.DataFrame(
X_t,
index=X_ww.index,
columns=[f"component_{i}" for i in range(X_t.shape[1])],
)
return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)
[docs] def fit_transform(self, X, y=None):
"""Fit and transform data using the PCA component.
Args:
X (pd.DataFrame): The input training data of shape [n_samples, n_features].
y (pd.Series, optional): The target training data of length [n_samples].
Returns:
pd.DataFrame: Transformed data.
Raises:
ValueError: If input data is not all numeric.
"""
X_ww = infer_feature_types(X)
if not is_all_numeric(X_ww):
raise ValueError("PCA input must be all numeric")
X_t = self._component_obj.fit_transform(X, y)
X_t = pd.DataFrame(
X_t,
index=X_ww.index,
columns=[f"component_{i}" for i in range(X_t.shape[1])],
)
return _retain_custom_types_and_initalize_woodwork(X_ww, X_t)