Source code for evalml.pipelines.components.estimators.regressors.arima_regressor

"""Autoregressive Integrated Moving Average Model. The three parameters (p, d, q) are the AR order, the degree of differencing, and the MA order. More information here: https://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.html."""
import numpy as np
import pandas as pd
from skopt.space import Integer

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import import_or_raise, infer_feature_types


[docs]class ARIMARegressor(Estimator):
    """Autoregressive Integrated Moving Average Model. The three parameters (p, d, q) are the AR order, the degree of differencing, and the MA order. More information here: https://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.html.

    Currently ARIMARegressor isn't supported via conda install. It's recommended that it be installed via PyPI.

    Args:
        date_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None.
        trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term,
            't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such
            as [1, 1, 0, 1].
        start_p (int): Minimum Autoregressive order. Defaults to 2.
        d (int): Minimum Differencing degree. Defaults to 0.
        start_q (int): Minimum Moving Average order. Defaults to 2.
        max_p (int): Maximum Autoregressive order. Defaults to 5.
        max_d (int): Maximum Differencing degree. Defaults to 2.
        max_q (int): Maximum Moving Average order. Defaults to 5.
        seasonal (boolean): Whether to fit a seasonal model to ARIMA. Defaults to True.
        n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. Defaults to -1.
        random_seed (int): Seed for the random number generator. Defaults to 0.
    """

    name = "ARIMA Regressor"
    hyperparameter_ranges = {
        "start_p": Integer(1, 3),
        "d": Integer(0, 2),
        "start_q": Integer(1, 3),
        "max_p": Integer(3, 10),
        "max_d": Integer(2, 5),
        "max_q": Integer(3, 10),
        "seasonal": [True, False],
    }
    """{
        "start_p": Integer(1, 3),
        "d": Integer(0, 2),
        "start_q": Integer(1, 3),
        "max_p": Integer(3, 10),
        "max_d": Integer(2, 5),
        "max_q": Integer(3, 10),
        "seasonal": [True, False],
    }"""
    model_family = ModelFamily.ARIMA
    """ModelFamily.ARIMA"""
    supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]
    """[ProblemTypes.TIME_SERIES_REGRESSION]"""

    def __init__(
        self,
        date_index=None,
        trend=None,
        start_p=2,
        d=0,
        start_q=2,
        max_p=5,
        max_d=2,
        max_q=5,
        seasonal=True,
        n_jobs=-1,
        random_seed=0,
        **kwargs,
    ):
        parameters = {
            "trend": trend,
            "start_p": start_p,
            "d": d,
            "start_q": start_q,
            "max_p": max_p,
            "max_d": max_d,
            "max_q": max_q,
            "seasonal": seasonal,
            "n_jobs": n_jobs,
            "date_index": date_index,
        }

        parameters.update(kwargs)

        arima_model_msg = (
            "sktime is not installed. Please install using `pip install sktime.`"
        )
        sktime_arima = import_or_raise(
            "sktime.forecasting.arima", error_msg=arima_model_msg
        )
        arima_model = sktime_arima.AutoARIMA(**parameters)

        super().__init__(
            parameters=parameters, component_obj=arima_model, random_seed=random_seed
        )

    def _get_dates(self, X, y):
        date_col = None
        if y is not None:
            y_index_type = infer_feature_types(
                pd.Series(y.index)
            ).ww.logical_type.type_string
            if y_index_type == "datetime":
                date_col = y.index
        if X is not None:
            X_index_type = infer_feature_types(
                pd.Series(X.index)
            ).ww.logical_type.type_string
            if self.parameters["date_index"] in X.columns:
                date_col = X.pop(self.parameters["date_index"])
            elif X_index_type == "datetime":
                date_col = X.index
        if date_col is None:
            msg = (
                "ARIMA regressor requires input data X to have a datetime column specified by the 'date_index' parameter. "
                "If not it will look for the datetime column in the index of X or y."
            )
            raise ValueError(msg)
        return date_col, X

    def _match_indices(self, X, y, date_col):
        if X is not None:
            X = X.copy()
            X.index = date_col
        if y is not None:
            y = y.copy()
            y.index = date_col
        return X, y

    def _format_dates(self, dates, X, y, predict=False):
        if len(dates.shape) == 1:
            dates = pd.DataFrame(dates)
        if dates.shape[1] == 1:
            dates.set_index(dates.columns[0], drop=True, inplace=True)
            dates = pd.DatetimeIndex(dates.index)
        elif dates.shape[1] > 1:
            raise ValueError(
                f"The dates parameter should not consist of any additional data outside of the datetime information located in the index or in a column."
                f" Found {dates.shape[1]} columns."
            )
        freq = pd.infer_freq(dates)
        dates = pd.DatetimeIndex(dates, freq=freq)
        X, y = self._match_indices(X, y, dates)
        if predict:
            arima_model_msg = (
                "sktime is not installed. Please install using `pip install sktime.`"
            )
            forecasting_ = import_or_raise(
                "sktime.forecasting.base", error_msg=arima_model_msg
            )
            fh_ = forecasting_.ForecastingHorizon(
                [i + 1 for i in range(len(dates))], is_relative=True
            )
            return X, y, fh_
        else:
            return X, y, None

[docs]    def fit(self, X, y=None):
        """Fits ARIMA regressor to data.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            ValueError: If X was passed to `fit` but not passed in `predict`.
        """
        if y is None:
            raise ValueError("ARIMA Regressor requires y as input.")

        X, y = self._manage_woodwork(X, y)
        dates, X = self._get_dates(X, y)
        X, y, _ = self._format_dates(dates, X, y)
        if X is not None and not X.empty:
            X = X.select_dtypes(exclude=["datetime64"])
            self._component_obj.fit(y=y, X=X)
        else:
            self._component_obj.fit(y=y)
        return self

[docs]    def predict(self, X, y=None):
        """Make predictions using fitted ARIMA regressor.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            y (pd.Series): Target data.

        Returns:
            pd.Series: Predicted values.

        Raises:
            ValueError: If X was passed to `fit` but not passed in `predict`.
        """
        X, y = self._manage_woodwork(X, y)
        dates, X = self._get_dates(X, y)
        X, y, fh_ = self._format_dates(dates, X, y, predict=True)
        if X is not None and not X.empty:
            X = X.select_dtypes(exclude=["datetime64"])
            y_pred = self._component_obj.predict(fh=fh_, X=X)
        else:
            try:
                y_pred = self._component_obj.predict(fh=fh_)
            except ValueError as ve:
                error = str(ve)
                if "When an ARIMA is fit with an X array" in error:
                    raise ValueError(
                        "If X was passed to the fit method of the ARIMARegressor, "
                        "then it must be passed to the predict method as well."
                    )
                else:
                    raise ve
        return infer_feature_types(y_pred)

    @property
    def feature_importance(self):
        """Returns array of 0's with a length of 1 as feature_importance is not defined for ARIMA regressor."""
        return np.zeros(1)