import pandas as pd
from sklearn.decomposition import PCA as SkPCA
from skopt.space import Real
from evalml.pipelines.components.transformers import Transformer
from evalml.utils import (
_retain_custom_types_and_initalize_woodwork,
infer_feature_types,
is_all_numeric,
)
class PCA(Transformer):
    """
    Reduces the number of features by projecting them onto principal components (PCA).

    Arguments:
        variance (float): Share of the original data variance to preserve when reducing
            the number of features, given as a fraction (e.g. the default 0.95).
        n_components (int): Number of features to keep after computing the SVD.
            Defaults to None. When set to a truthy value it takes precedence over
            ``variance``.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "PCA Transformer"
    hyperparameter_ranges = {"variance": Real(0.25, 1)}
    """{"variance": Real(0.25, 1)}"""

    def __init__(self, variance=0.95, n_components=None, random_seed=0, **kwargs):
        """Build the wrapped scikit-learn PCA and record the component parameters.

        Any extra keyword arguments are stored in the component parameters and
        forwarded unchanged to the scikit-learn PCA constructor.
        """
        # NOTE: the check is on truthiness, so a falsy n_components (None, 0)
        # falls back to the variance target.
        target = n_components if n_components else variance
        sk_pca = SkPCA(n_components=target, random_state=random_seed, **kwargs)

        params = {"variance": variance, "n_components": n_components, **kwargs}
        super().__init__(
            parameters=params, component_obj=sk_pca, random_seed=random_seed
        )

    def fit(self, X, y=None):
        """Fit the wrapped PCA object on X.

        Arguments:
            X: Input features; passed through ``infer_feature_types`` and required
                to be all numeric.
            y: Not used by this method.

        Returns:
            self

        Raises:
            ValueError: If X is not all numeric.
        """
        features = infer_feature_types(X)
        if is_all_numeric(features):
            self._component_obj.fit(features)
            return self
        raise ValueError("PCA input must be all numeric")