Source code for evalml.pipelines.components.estimators.regressors.arima_regressor

import numpy as np
import pandas as pd
from skopt.space import Integer

from evalml.model_family import ModelFamily
from evalml.pipelines.components.estimators import Estimator
from evalml.problem_types import ProblemTypes
from evalml.utils import import_or_raise, infer_feature_types


[docs]class ARIMARegressor(Estimator):
    """
    Autoregressive Integrated Moving Average Model.
    The three parameters (p, d, q) are the AR order, the degree of differencing, and the MA order.
    More information here: https://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.html

    Currently ARIMARegressor isn't supported via conda install. It's recommended that it be installed via PyPI.

    """
    name = "ARIMA Regressor"
    hyperparameter_ranges = {
        "start_p": Integer(1, 3),
        "d": Integer(0, 2),
        "start_q": Integer(1, 3),
        "max_p": Integer(3, 10),
        "max_d": Integer(2, 5),
        "max_q": Integer(3, 10),
        "seasonal": [True, False]
    }
    model_family = ModelFamily.ARIMA
    supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]

[docs]    def __init__(self, date_index=None, trend=None, start_p=2, d=0, start_q=2, max_p=5, max_d=2, max_q=5, seasonal=True,
                 n_jobs=-1, random_seed=0, **kwargs):
        """
        Arguments:
            date_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None.
            trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term,
                't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such
                as [1, 1, 0, 1].
            start_p (int): Minimum Autoregressive order.
            d (int): Minimum Differencing degree.
            start_q (int): Minimum Moving Average order.
            max_p (int): Maximum Autoregressive order.
            max_d (int): Maximum Differencing degree.
            max_q (int): Maximum Moving Average order.
            seasonal (bool): Whether to fit a seasonal model to ARIMA.
        """

        parameters = {'trend': trend,
                      'start_p': start_p,
                      'd': d,
                      'start_q': start_q,
                      'max_p': max_p,
                      'max_d': max_d,
                      'max_q': max_q,
                      'seasonal': seasonal,
                      "n_jobs": n_jobs,
                      "date_index": date_index}

        parameters.update(kwargs)

        arima_model_msg = "sktime is not installed. Please install using `pip install sktime.`"
        sktime_arima = import_or_raise("sktime.forecasting.arima", error_msg=arima_model_msg)
        arima_model = sktime_arima.AutoARIMA(**parameters)

        super().__init__(parameters=parameters,
                         component_obj=arima_model,
                         random_seed=random_seed)

    def _get_dates(self, X, y):
        date_col = None
        if y is not None:
            y_index_type = infer_feature_types(pd.Series(y.index)).logical_type.type_string
            if y_index_type == 'datetime':
                date_col = y.index
        if X is not None:
            X_index_type = infer_feature_types(pd.Series(X.index)).logical_type.type_string
            if self.parameters['date_index'] in X.columns:
                date_col = X.pop(self.parameters['date_index'])
            elif X_index_type == 'datetime':
                date_col = X.index
        if date_col is None:
            msg = "ARIMA regressor requires input data X to have a datetime column specified by the 'date_index' parameter. " \
                  "If not it will look for the datetime column in the index of X or y."
            raise ValueError(msg)
        return date_col, X

    def _match_indices(self, X, y, date_col):
        if X is not None:
            X.index = date_col
        if y is not None:
            y.index = date_col
        return X, y

    def _format_dates(self, dates, X, y, predict=False):
        if len(dates.shape) == 1:
            dates = pd.DataFrame(dates)
        if dates.shape[1] == 1:
            dates.set_index(dates.columns[0], drop=True, inplace=True)
            dates = pd.DatetimeIndex(dates.index)
        elif dates.shape[1] > 1:
            raise ValueError(f"The dates parameter should not consist of any additional data outside of the datetime information located in the index or in a column."
                             f" Found {dates.shape[1]} columns.")
        freq = 'M' if pd.infer_freq(dates) == 'MS' else pd.infer_freq(dates)
        dates = dates.to_period(freq=freq)
        X, y = self._match_indices(X, y, dates)
        if predict:
            arima_model_msg = "sktime is not installed. Please install using `pip install sktime.`"
            forecasting_ = import_or_raise("sktime.forecasting.base", error_msg=arima_model_msg)
            fh_ = forecasting_.ForecastingHorizon(dates, is_relative=False)
            return X, y, fh_
        else:
            return X, y, None

[docs]    def fit(self, X, y=None):
        if y is None:
            raise ValueError('ARIMA Regressor requires y as input.')

        X, y = self._manage_woodwork(X, y)
        dates, X = self._get_dates(X, y)
        X, y, _ = self._format_dates(dates, X, y)
        if X is not None and not X.empty:
            X = X.select_dtypes(exclude=['datetime64'])
            self._component_obj.fit(y=y, X=X)
        else:
            self._component_obj.fit(y=y)
        return self

[docs]    def predict(self, X, y=None):
        X, y = self._manage_woodwork(X, y)
        dates, X = self._get_dates(X, y)
        X, y, fh_ = self._format_dates(dates, X, y, predict=True)
        if X is not None and not X.empty:
            X = X.select_dtypes(exclude=['datetime64'])
            y_pred = self._component_obj.predict(fh=fh_, X=X)
        else:
            try:
                y_pred = self._component_obj.predict(fh=fh_)
            except ValueError as ve:
                error = str(ve)
                if "When an ARIMA is fit with an X array" in error:
                    raise ValueError("If X was passed to the fit method of the ARIMARegressor, "
                                     "then it must be passed to the predict method as well.")
                else:
                    raise ve
        return infer_feature_types(y_pred)

    @property
    def feature_importance(self):
        """
        Returns array of 0's with a length of 1 as feature_importance is not defined for ARIMA regressor.
        """
        return np.zeros(1)