# Source code for evalml.preprocessing.utils

"""Helpful preprocessing utilities."""

import pandas as pd
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

from evalml.pipelines.utils import stack_data, stack_X, unstack_multiseries
from evalml.preprocessing.data_splitters import TrainingValidationSplit
from evalml.problem_types import (
    is_classification,
    is_multiseries,
    is_regression,
    is_time_series,
)
from evalml.utils import infer_feature_types


def load_data(path, index, target, n_rows=None, drop=None, verbose=True, **kwargs):
    """Load features and target from file.

    The file is read with ``pd.read_csv``. The ``target`` column is returned as the
    target, and every other column except those listed in ``drop`` is returned as
    the features matrix.

    Args:
        path (str): Path to file or a http/ftp/s3 URL.
        index (str): Column for index.
        target (str): Column for target.
        n_rows (int): Number of rows to return. Defaults to None.
        drop (list): List of columns to drop. Defaults to None.
        verbose (bool): If True, prints information about features and target. Defaults to True.
        **kwargs: Other keyword arguments that should be passed to pandas' `read_csv` method.

    Returns:
        pd.DataFrame, pd.Series: Features matrix and target, each passed through `infer_feature_types`.
    """
    feature_matrix = pd.read_csv(path, index_col=index, nrows=n_rows, **kwargs)

    # The target never stays in the features; any extra columns requested via
    # `drop` are removed in the same call.
    columns_to_remove = [target] + (drop or [])
    y = feature_matrix[target]
    X = feature_matrix.drop(columns=columns_to_remove)

    if verbose:
        # Count of feature columns per high-level type.
        print(number_of_features(X.dtypes), end="\n\n")

        # Total number of rows that were loaded.
        info = "Number of training examples: {}"
        print(info.format(len(X)), end="\n")

        # Share of each distinct target value.
        print(target_distribution(y))

    return infer_feature_types(X), infer_feature_types(y)


def split_multiseries_data(X, y, series_id, time_index, **kwargs):
    """Split stacked multiseries data into train and test sets. Unstacked data can use `split_data`.

    The data is unstacked with `unstack_multiseries`, split by `split_data` as a
    "time series regression" problem, and each piece is then stacked again.

    Args:
        X (pd.DataFrame): The input training data of shape [n_samples*n_series, n_features].
        y (pd.Series): The target training targets of length [n_samples*n_series].
        series_id (str): Name of column containing series id.
        time_index (str): Name of column containing time index.
        **kwargs: Additional keyword arguments to pass to the split_data function.

    Returns:
        pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets.
    """
    X_unstacked, y_unstacked = unstack_multiseries(
        X,
        y,
        series_id,
        time_index,
        y.name,
    )
    (
        X_train_unstacked,
        X_holdout_unstacked,
        y_train_unstacked,
        y_holdout_unstacked,
    ) = split_data(
        X_unstacked, y_unstacked, problem_type="time series regression", **kwargs
    )

    # Get unique series values (as a list to maintain order) from X if there is only the time_index column
    # Otherwise, this information is generated in `stack_X` from the column values
    series_id_values = X[series_id].unique() if len(X_unstacked.columns) == 1 else None

    X_train = stack_X(
        X_train_unstacked,
        series_id,
        time_index,
        series_id_values=series_id_values,
    )
    # The holdout rows are numbered starting right after the last training index
    # label, so the two stacked frames do not share index values.
    X_holdout = stack_X(
        X_holdout_unstacked,
        series_id,
        time_index,
        starting_index=X_train.index[-1] + 1,
        series_id_values=series_id_values,
    )
    y_train = stack_data(y_train_unstacked)
    # Same continuation rule for the target as for the features above.
    y_holdout = stack_data(y_holdout_unstacked, starting_index=y_train.index[-1] + 1)

    return X_train, X_holdout, y_train, y_holdout


def split_data(
    X,
    y,
    problem_type,
    problem_configuration=None,
    test_size=None,
    random_seed=0,
):
    """Split data into train and test sets.

    Multiseries problems with a ``pd.Series`` target are delegated to
    `split_multiseries_data`. Time series problems are split without shuffling,
    other regression problems with a shuffled split, and other classification
    problems with a stratified shuffled split.

    Args:
        X (pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the time_index, gap, and max_delay variables.
        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%) for non-timeseries problems and 0.1
            (10%) for timeseries problems. For timeseries problems the default is raised to ``forecast_horizon / len(X)`` when
            problem_configuration contains a forecast_horizon and that fraction is larger than 0.1.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets.

    Raises:
        ValueError: If the problem_configuration is missing or does not contain both a time_index and series_id for multiseries problems.
        ValueError: If no data splitter could be selected for the given problem_type.

    Examples:
        >>> X = pd.DataFrame([1, 2, 3, 4, 5, 6], columns=["First"])
        >>> y = pd.Series([8, 9, 10, 11, 12, 13])
        ...
        >>> X_train, X_validation, y_train, y_validation = split_data(X, y, "regression", random_seed=42)
        >>> X_train
           First
        5      6
        2      3
        4      5
        3      4
        >>> X_validation
           First
        0      1
        1      2
        >>> y_train
        5    13
        2    10
        4    12
        3    11
        dtype: int64
        >>> y_validation
        0    8
        1    9
        dtype: int64
    """
    if is_multiseries(problem_type) and isinstance(y, pd.Series):
        if problem_configuration is None:
            raise ValueError(
                "split_data requires problem_configuration for multiseries problems",
            )
        series_id = problem_configuration.get("series_id")
        time_index = problem_configuration.get("time_index")
        if series_id is None or time_index is None:
            raise ValueError(
                "split_data needs both series_id and time_index values in the problem_configuration to split multiseries data",
            )
        return split_multiseries_data(
            X,
            y,
            series_id,
            time_index,
            problem_configuration=problem_configuration,
            test_size=test_size,
            random_seed=random_seed,
        )

    X = infer_feature_types(X)
    y = infer_feature_types(y)

    data_splitter = None
    if is_time_series(problem_type):
        if test_size is None:
            test_size = 0.1
            # Make sure the default holdout is at least as long as the forecast horizon.
            if (
                problem_configuration is not None
                and "forecast_horizon" in problem_configuration
            ):
                fh_pct = problem_configuration["forecast_horizon"] / len(X)
                test_size = max(test_size, fh_pct)
        # Time series rows keep their order: no shuffling, no stratification.
        data_splitter = TrainingValidationSplit(
            test_size=test_size,
            shuffle=False,
            stratify=None,
            random_seed=random_seed,
        )
    else:
        if test_size is None:
            test_size = 0.2
        if is_regression(problem_type):
            data_splitter = ShuffleSplit(
                n_splits=1,
                test_size=test_size,
                random_state=random_seed,
            )
        elif is_classification(problem_type):
            data_splitter = StratifiedShuffleSplit(
                n_splits=1,
                test_size=test_size,
                random_state=random_seed,
            )

    # Fail with a clear message instead of an AttributeError on `None.split`
    # when none of the branches above recognised the problem type.
    if data_splitter is None:
        raise ValueError(
            f"split_data could not select a data splitter for problem type {problem_type!r}",
        )

    # Each splitter is used for a single split, so only the first pair of
    # positional index arrays is taken.
    train, test = next(data_splitter.split(X, y))

    # Index through the `.ww` accessor, as the inputs were just passed through
    # `infer_feature_types`.
    X_train = X.ww.iloc[train]
    X_test = X.ww.iloc[test]
    y_train = y.ww.iloc[train]
    y_test = y.ww.iloc[test]

    return X_train, X_test, y_train, y_test


def number_of_features(dtypes):
    """Get the number of features of each specific dtype in a DataFrame.

    Each dtype is classified as Boolean, Numeric, Datetime or Categorical using the
    ``pd.api.types`` predicates, so every integer/float width, nullable dtypes,
    categoricals, string dtypes and datetimes of any resolution are counted.
    Dtypes that fit none of these groups (for example timedeltas) are not counted.

    Args:
        dtypes (pd.Series): DataFrame.dtypes to get the number of features for.

    Returns:
        pd.DataFrame: A single "Number of Features" column indexed by type name, sorted by type name.

    Example:
        >>> X = pd.DataFrame()
        >>> X["integers"] = [i for i in range(10)]
        >>> X["floats"] = [float(i) for i in range(10)]
        >>> X["strings"] = [str(i) for i in range(10)]
        >>> X["booleans"] = [bool(i%2) for i in range(10)]

        Lists the number of columns corresponding to each dtype.

        >>> number_of_features(X.dtypes)
                     Number of Features
        Boolean                       1
        Categorical                   1
        Numeric                       2
    """
    type_checks = pd.api.types

    def _to_vtype(dtype):
        # Categoricals go first: the other predicates may look at the categories.
        if isinstance(dtype, pd.CategoricalDtype) or str(dtype) == "category":
            return "Categorical"
        # Booleans must be tested before numerics, which also accept bool.
        if type_checks.is_bool_dtype(dtype):
            return "Boolean"
        if type_checks.is_numeric_dtype(dtype):
            return "Numeric"
        if type_checks.is_datetime64_any_dtype(dtype):
            return "Datetime"
        if type_checks.is_object_dtype(dtype) or type_checks.is_string_dtype(dtype):
            return "Categorical"
        # Unrecognised dtypes map to a missing value, which value_counts skips.
        return None

    vtypes = dtypes.map(_to_vtype).value_counts()
    return vtypes.sort_index().to_frame("Number of Features")


def target_distribution(targets):
    """Get the target distributions.

    Each distinct target value is reported with its share of ``len(targets)``,
    formatted as a percentage string with two decimals. Missing values get no row
    of their own but still count towards ``len(targets)``, so the shares add up to
    less than 100% when the target contains missing values.

    Args:
        targets (pd.Series): Target data.

    Returns:
        pd.Series: Target data and their frequency distribution as percentages.

    Examples:
        >>> y = pd.Series([1, 2, 4, 1, 3, 3, 1, 2])
        >>> print(target_distribution(y).to_string())
        Targets
        1    37.50%
        2    25.00%
        3    25.00%
        4    12.50%
        >>> y = pd.Series([True, False, False, False, True])
        >>> print(target_distribution(y).to_string())
        Targets
        False    60.00%
        True     40.00%
    """
    # Fraction of all rows taken by each distinct value.
    distribution = targets.value_counts() / len(targets)
    as_percent = "{:.2f}%".format
    return distribution.mul(100).apply(as_percent).rename_axis("Targets")