Source code for evalml.pipelines.components.transformers.dimensionality_reduction.lda

"""Component that reduces the number of features by using Linear Discriminant Analysis."""
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as SkLDA

from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types, is_all_numeric


[docs]class LinearDiscriminantAnalysis(Transformer):
    """Reduces the number of features by using Linear Discriminant Analysis.

    Args:
        n_components (int): The number of features to maintain after computation. Defaults to None.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "Linear Discriminant Analysis Transformer"
    hyperparameter_ranges = {}
    """{}"""

    def __init__(self, n_components=None, random_seed=0, **kwargs):
        if n_components and n_components < 1:
            raise ValueError(
                "Invalid number of compponents for Linear Discriminant Analysis",
            )
        parameters = {"n_components": n_components}
        parameters.update(kwargs)
        lda = SkLDA(n_components=n_components, **kwargs)
        super().__init__(
            parameters=parameters,
            component_obj=lda,
            random_seed=random_seed,
        )

[docs]    def fit(self, X, y):
        """Fits the LDA component.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            ValueError: If input data is not all numeric.
        """
        X = infer_feature_types(X)
        if not is_all_numeric(X):
            raise ValueError("LDA input must be all numeric")
        y = infer_feature_types(y)
        n_features = X.shape[1]
        n_classes = y.nunique()
        n_components = self.parameters["n_components"]
        if n_components is not None and n_components > min(n_classes, n_features):
            raise ValueError(f"n_components value {n_components} is too large")

        self._component_obj.fit(X, y)
        return self

[docs]    def transform(self, X, y=None):
        """Transform data using the fitted LDA component.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            pd.DataFrame: Transformed data.

        Raises:
            ValueError: If input data is not all numeric.
        """
        X_ww = infer_feature_types(X)
        if not is_all_numeric(X_ww):
            raise ValueError("LDA input must be all numeric")
        X_t = self._component_obj.transform(X)
        X_t = pd.DataFrame(
            X_t,
            index=X_ww.index,
            columns=[f"component_{i}" for i in range(X_t.shape[1])],
        )
        X_t.ww.init()
        return X_t

[docs]    def fit_transform(self, X, y=None):
        """Fit and transform data using the LDA component.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series, optional): The target training data of length [n_samples].

        Returns:
            pd.DataFrame: Transformed data.

        Raises:
            ValueError: If input data is not all numeric.
        """
        X_ww = infer_feature_types(X)
        if not is_all_numeric(X_ww):
            raise ValueError("LDA input must be all numeric")
        y = infer_feature_types(y)

        X_t = self._component_obj.fit_transform(X, y)
        X_t = pd.DataFrame(
            X_t,
            index=X_ww.index,
            columns=[f"component_{i}" for i in range(X_t.shape[1])],
        )
        X_t.ww.init()
        return X_t