Source code for evalml.pipelines.components.estimators.regressors.varmax_regressor

"""Vector Autoregressive Moving Average with eXogenous regressors model. The two parameters (p, q) are the AR order and the MA order. More information here: https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.varmax.VARMAX.html."""
from typing import Dict, Hashable, List, Optional, Union

import numpy as np
import pandas as pd
from skopt.space import Categorical, Integer
from sktime.forecasting.base import ForecastingHorizon

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.pipelines.components.utils import convert_bool_to_double, match_indices
from evalml.problem_types import ProblemTypes
from evalml.utils import (
    import_or_raise,
    infer_feature_types,
)


class VARMAXRegressor(Estimator):
    """Vector Autoregressive Moving Average with eXogenous regressors model. The two parameters (p, q) are the AR order and the MA order. More information here: https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.varmax.VARMAX.html.

    Currently VARMAXRegressor isn't supported via conda install. It's recommended that it be installed via PyPI.

    Args:
        time_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None.
        p (int): Maximum Autoregressive order. Defaults to 1.
        q (int): Maximum Moving Average order. Defaults to 0.
        trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term,
            't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such
            as [1, 1, 0, 1]. Defaults to 'c'.
        random_seed (int): Seed for the random number generator. Defaults to 0.
        maxiter (int): Maximum number of iterations for solver. Defaults to 10.
        use_covariates (bool): If True, will pass exogenous variables in fit/predict methods. If False, forecasts will
            solely be based off of the datetimes and target values. Defaults to False.
        **kwargs: Additional keyword arguments merged into the parameters passed to the sktime VARMAX constructor.
    """

    # Number of simulated paths drawn when estimating prediction intervals.
    _N_REPETITIONS = 400

    name = "VARMAX Regressor"
    # Both orders start at 1 so the search space never samples order (0, 0),
    # which is not a valid VARMAX specification.
    hyperparameter_ranges = {
        "p": Integer(1, 10),
        "q": Integer(1, 10),
        "trend": Categorical(["n", "c", "t", "ct"]),
    }
    """{
        "p": Integer(1, 10),
        "q": Integer(1, 10),
        "trend": Categorical(['n', 'c', 't', 'ct']),
    }"""
    model_family = ModelFamily.VARMAX
    """ModelFamily.VARMAX"""
    supported_problem_types = [ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]
    """[ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""

    def __init__(
        self,
        time_index: Optional[Hashable] = None,
        p: int = 1,
        q: int = 0,
        trend: Optional[str] = "c",
        random_seed: Union[int, float] = 0,
        maxiter: int = 10,
        use_covariates: bool = False,
        **kwargs,
    ):
        self.preds_95_upper = None
        self.preds_95_lower = None

        # Parameters understood by the sktime VARMAX constructor; user kwargs
        # are merged last so they can override the defaults above.
        parameters = {
            "order": (p, q),
            "trend": trend,
            "maxiter": maxiter,
        }
        parameters.update(kwargs)

        varmax_model_msg = (
            "sktime is not installed. Please install using `pip install sktime.`"
        )
        sktime_varmax = import_or_raise(
            "sktime.forecasting.varmax",
            error_msg=varmax_model_msg,
        )
        varmax_model = sktime_varmax.VARMAX(**parameters)

        # These entries are recorded on the component only; they are added
        # after the sktime model is built so they are never passed to it.
        parameters["use_covariates"] = use_covariates
        parameters["time_index"] = time_index
        parameters.update({"p": p, "q": q})

        self.use_covariates = use_covariates
        self.time_index = time_index

        super().__init__(
            parameters=parameters,
            component_obj=varmax_model,
            random_seed=random_seed,
        )

    def _set_forecast_horizon(self, X: pd.DataFrame):
        """Build the relative forecasting horizon for the rows of X.

        The horizon starts at the distance between the last index value recorded in
        ``fit`` (``self.last_X_index``) and the first index value of X, and then
        advances by one step per row of X. If that distance cannot be computed the
        horizon starts at 1.

        Args:
            X (pd.DataFrame): Data to forecast for.

        Returns:
            ForecastingHorizon: Relative horizon with one entry per row of X.
        """
        # we can only calculate the difference if the indices are of the same type
        units_diff = 1
        if isinstance(X.index[0], type(self.last_X_index)):
            if isinstance(X.index, pd.DatetimeIndex):
                # Count the periods between the end of training and the start of X.
                dates_diff = pd.date_range(
                    start=self.last_X_index,
                    end=X.index[0],
                    freq=X.index.freq,
                )
                units_diff = len(dates_diff) - 1
            elif X.index.is_numeric():
                units_diff = X.index[0] - self.last_X_index
        fh_ = ForecastingHorizon(
            [units_diff + i for i in range(len(X))],
            is_relative=True,
        )
        return fh_

    def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
        """Fits VARMAX regressor to data.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.DataFrame): The target training data of shape [n_samples, n_series_id_values].

        Returns:
            self

        Raises:
            ValueError: If y was not passed in.
        """
        X, y = self._manage_woodwork(X, y)

        if y is None:
            raise ValueError("VARMAX Regressor requires y as input.")
        y = convert_bool_to_double(y, include_ints=True)

        if X is not None and self.use_covariates:
            # Remember where the training data ends so predict can compute
            # how far ahead the requested rows are.
            self.last_X_index = X.index[-1]
            X = X.ww.select(exclude=["Datetime"])

            X = convert_bool_to_double(X)
            X, y = match_indices(X, y)

            # Only pass exogenous data when columns remain after dropping datetimes.
            if not X.empty:
                self._component_obj.fit(y=y, X=X)
            else:
                self._component_obj.fit(y=y)
        else:
            self.last_X_index = y.index[-1]
            self._component_obj.fit(y=y)
        return self

    def _manage_types_and_forecast(self, X: pd.DataFrame) -> tuple:
        """Compute the forecasting horizon for X and prepare X as exogenous data.

        Args:
            X (pd.DataFrame): Data to forecast for.

        Returns:
            tuple: X without Datetime columns and with booleans converted, and the forecasting horizon.
        """
        # The horizon must be computed first, while X still has its original index and columns.
        fh_ = self._set_forecast_horizon(X)
        X = X.ww.select(exclude=["Datetime"])
        X = convert_bool_to_double(X)
        return X, fh_

    def predict(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> pd.Series:
        """Make predictions using fitted VARMAX regressor.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            y (pd.DataFrame): Target data of shape [n_samples, n_series_id_values].

        Returns:
            pd.Series: Predicted values.
        """
        X, y = self._manage_woodwork(X, y)
        X, fh_ = self._manage_types_and_forecast(X=X)

        if not X.empty and self.use_covariates:
            if fh_[0] != 1:
                # statsmodels (which sktime uses under the hood) only forecasts off the training data
                # but sktime circumvents this by predicting everything from the end of training data to the date / periods requested
                # and only returning the values for dates / periods given to sktime. Because of this,
                # the number of covariate rows must equal the total number of periods (X.shape[0] == fh_[-1]) if covariates are used.
                # We circumvent this by adding arbitrary rows to the start of X since sktime discards these values when predicting.
                num_rows_diff = fh_[-1] - X.shape[0]
                filler = pd.DataFrame(
                    columns=X.columns,
                    index=range(num_rows_diff),
                ).fillna(0)
                X_ = pd.concat([filler, X], ignore_index=True)
                X_.ww.init(schema=X.ww.schema)
            else:
                X_ = X
            y_pred = self._component_obj.predict(
                fh=fh_,
                X=X_,
            )
        else:
            y_pred = self._component_obj.predict(
                fh=fh_,
            )
        # Report predictions against the index of the rows that were requested.
        y_pred.index = X.index
        return infer_feature_types(y_pred)

    def get_prediction_intervals(
        self,
        X: pd.DataFrame,
        y: Optional[pd.DataFrame] = None,
        coverage: Optional[List[float]] = None,
        predictions: Optional[pd.Series] = None,
    ) -> Dict[str, pd.Series]:
        """Find the prediction intervals using the fitted VARMAXRegressor.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            y (pd.DataFrame): Target data of shape [n_samples, n_series_id_values]. Optional.
            coverage (list[float]): A list of floats between the values 0 and 1 that the upper and lower bounds of the
                prediction interval should be calculated for. Defaults to [0.95] when None.
            predictions (pd.Series): Not used for VARMAX regressor.

        Returns:
            dict[dict]: A dict of prediction intervals, where the dict is in the format {series_id: {coverage}_lower or {coverage}_upper}.
        """
        if coverage is None:
            coverage = [0.95]

        X, y = self._manage_woodwork(X, y)
        # Accesses the fitted statsmodels model within sktime
        fitted_forecaster = self._component_obj._fitted_forecaster
        use_exog = (
            # If exogenous variables were used during training
            fitted_forecaster.model.exog is not None
            and self.use_covariates
        )
        if use_exog:
            X = X.ww.select(exclude=["Datetime"])
            X = convert_bool_to_double(X)
        # nsimulations represents how many steps should be simulated
        # repetitions represents the number of simulations that should be run (confusing, I know)
        # anchor represents where the simulations should start from (forecasting is done from the "end")
        y_pred = fitted_forecaster.simulate(
            nsimulations=X.shape[0],
            repetitions=self._N_REPETITIONS,
            anchor="end",
            random_state=self.random_seed,
            exog=X if use_exog else None,
        )
        prediction_interval_result = {}
        # Access the target column names (i.e. the series_id values) that the VARMAX component obj was fitted on
        for series in fitted_forecaster.model.endog_names:
            series_result = {}
            # Keep only the simulated columns that belong to this series.
            series_preds = y_pred[[col for col in y_pred.columns if series in col]]
            for conf_int in coverage:
                # Bounds are the symmetric quantiles across the simulated paths for each step.
                prediction_interval_lower = series_preds.quantile(
                    q=round((1 - conf_int) / 2, 3),
                    axis="columns",
                )
                prediction_interval_upper = series_preds.quantile(
                    q=round((1 + conf_int) / 2, 3),
                    axis="columns",
                )
                prediction_interval_lower.index = X.index
                prediction_interval_upper.index = X.index
                series_result[f"{conf_int}_lower"] = prediction_interval_lower
                series_result[f"{conf_int}_upper"] = prediction_interval_upper
            prediction_interval_result[series] = series_result
        return prediction_interval_result

    @property
    def feature_importance(self) -> np.ndarray:
        """Returns array of 0's with a length of 1 as feature_importance is not defined for VARMAX regressor."""
        return np.zeros(1)