Source code for evalml.pipelines.components.estimators.regressors.varmax_regressor
"""Vector Autoregressive Moving Average with eXogenous regressors model. The two parameters (p, q) are the AR order and the MA order. More information here: https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.varmax.VARMAX.html."""fromtypingimportDict,Hashable,List,Optional,Unionimportnumpyasnpimportpandasaspdfromskopt.spaceimportCategorical,Integerfromsktime.forecasting.baseimportForecastingHorizonfromevalml.model_familyimportModelFamilyfromevalml.pipelines.components.estimatorsimportEstimatorfromevalml.pipelines.components.utilsimportconvert_bool_to_double,match_indicesfromevalml.problem_typesimportProblemTypesfromevalml.utilsimport(import_or_raise,infer_feature_types,)
class VARMAXRegressor(Estimator):
    """Vector Autoregressive Moving Average with eXogenous regressors model. The two parameters (p, q) are the AR order and the MA order. More information here: https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.varmax.VARMAX.html.

    Currently VARMAXRegressor isn't supported via conda install. It's recommended that it be installed via PyPI.

    Args:
        time_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None.
        p (int): Maximum Autoregressive order. Defaults to 1.
        q (int): Maximum Moving Average order. Defaults to 0.
        trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term,
            't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such
            as [1, 1, 0, 1]. Defaults to 'c'.
        random_seed (int): Seed for the random number generator. Defaults to 0.
        maxiter (int): Maximum number of iterations for solver. Defaults to 10.
        use_covariates (bool): If True, will pass exogenous variables in fit/predict methods. If False, forecasts will
            solely be based off of the datetimes and target values. Defaults to False.
        **kwargs: Additional keyword arguments forwarded unchanged to the sktime VARMAX constructor.
    """

    # Number of simulated paths drawn per series when estimating prediction intervals.
    _N_REPETITIONS = 400

    name = "VARMAX Regressor"
    hyperparameter_ranges = {
        "p": Integer(0, 10),
        "q": Integer(0, 10),
        "trend": Categorical(["n", "c", "t", "ct"]),
    }
    """{
        "p": Integer(0, 10),
        "q": Integer(0, 10),
        "trend": Categorical(['n', 'c', 't', 'ct']),
    }"""
    model_family = ModelFamily.VARMAX
    """ModelFamily.VARMAX"""
    supported_problem_types = [ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]
    """[ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION]"""

    def __init__(
        self,
        time_index: Optional[Hashable] = None,
        p: int = 1,
        q: int = 0,
        trend: Optional[str] = "c",
        random_seed: Union[int, float] = 0,
        maxiter: int = 10,
        use_covariates: bool = False,
        **kwargs,
    ):
        self.preds_95_upper = None
        self.preds_95_lower = None

        # Only these keys (plus any kwargs) are understood by the sktime model.
        parameters = {
            "order": (p, q),
            "trend": trend,
            "maxiter": maxiter,
        }
        parameters.update(kwargs)

        varmax_model_msg = (
            "sktime is not installed. Please install using `pip install sktime.`"
        )
        sktime_varmax = import_or_raise(
            "sktime.forecasting.varmax",
            error_msg=varmax_model_msg,
        )
        varmax_model = sktime_varmax.VARMAX(**parameters)

        # Component-level parameters are recorded only after the sktime model
        # has been built, so they are never passed to its constructor.
        parameters["use_covariates"] = use_covariates
        parameters["time_index"] = time_index
        parameters.update({"p": p, "q": q})

        self.use_covariates = use_covariates
        self.time_index = time_index

        super().__init__(
            parameters=parameters,
            component_obj=varmax_model,
            random_seed=random_seed,
        )

    def _set_forecast_horizon(self, X: pd.DataFrame):
        """Build a relative forecasting horizon covering every row of X.

        The horizon starts at the distance between the last index seen in
        `fit` (`self.last_X_index`) and the first index of X, and contains one
        consecutive step per row of X.

        Args:
            X (pd.DataFrame): Data to forecast for; only its index is used.

        Returns:
            ForecastingHorizon: Relative horizon with `len(X)` consecutive steps.
        """
        # We can only calculate the difference if the indices are of the same type;
        # otherwise assume X starts immediately after the training data.
        units_diff = 1
        if isinstance(X.index[0], type(self.last_X_index)):
            if isinstance(X.index, pd.DatetimeIndex):
                dates_diff = pd.date_range(
                    start=self.last_X_index,
                    end=X.index[0],
                    freq=X.index.freq,
                )
                # date_range includes both endpoints, hence the -1.
                units_diff = len(dates_diff) - 1
            # NOTE(review): Index.is_numeric is reported as deprecated in recent
            # pandas releases - confirm against the supported pandas range.
            elif X.index.is_numeric():
                units_diff = X.index[0] - self.last_X_index
        fh_ = ForecastingHorizon(
            [units_diff + i for i in range(len(X))],
            is_relative=True,
        )
        return fh_

    def _manage_types_and_forecast(self, X: pd.DataFrame) -> tuple:
        """Compute the forecasting horizon for X and prepare X for use as covariates.

        The preprocessing mirrors what `fit` applies to its covariates: Datetime
        columns are dropped and boolean columns are converted to doubles.

        Args:
            X (pd.DataFrame): Woodwork-initialized data of shape [n_samples, n_features].

        Returns:
            tuple: `(X, fh_)` - the prepared covariates and the relative forecasting horizon.
        """
        fh_ = self._set_forecast_horizon(X)
        X = X.ww.select(exclude=["Datetime"])
        X = convert_bool_to_double(X)
        return X, fh_

    def fit(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None):
        """Fits VARMAX regressor to data.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.DataFrame): The target training data of shape [n_samples, n_series_id_values].

        Returns:
            self

        Raises:
            ValueError: If y was not passed in.
        """
        X, y = self._manage_woodwork(X, y)
        if y is None:
            raise ValueError("VARMAX Regressor requires y as input.")

        y = convert_bool_to_double(y, include_ints=True)

        if X is not None and self.use_covariates:
            # Remember where training ended so predict can compute the horizon.
            self.last_X_index = X.index[-1]
            X = X.ww.select(exclude=["Datetime"])
            X = convert_bool_to_double(X)
            X, y = match_indices(X, y)

            # X may hold nothing but Datetime columns; then fit without exogenous data.
            if not X.empty:
                self._component_obj.fit(y=y, X=X)
            else:
                self._component_obj.fit(y=y)
        else:
            self.last_X_index = y.index[-1]
            self._component_obj.fit(y=y)

        return self

    def predict(self, X: pd.DataFrame, y: Optional[pd.DataFrame] = None) -> pd.Series:
        """Make predictions using fitted VARMAX regressor.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            y (pd.DataFrame): Target data of shape [n_samples, n_series_id_values]. Not used for prediction.

        Returns:
            pd.Series: Predicted values.
        """
        X, y = self._manage_woodwork(X, y)
        X, fh_ = self._manage_types_and_forecast(X=X)

        if not X.empty and self.use_covariates:
            if fh_[0] != 1:
                # statsmodels (which sktime uses under the hood) only forecasts off the training data
                # but sktime circumvents this by predicting everything from the end of training data to the
                # date / periods requested and only returning the values for dates / periods given to sktime.
                # Because of this, the number of covariate rows has to equal the total number of periods
                # (X.shape[0] == fh_[-1]) if covariates are used.
                # We circumvent this by adding arbitrary rows to the start of X since sktime discards
                # these values when predicting.
                num_rows_diff = fh_[-1] - X.shape[0]
                # Build the zero filler directly instead of an all-NaN object
                # frame + fillna(0), which depends on pandas' deprecated
                # silent object downcasting.
                filler = pd.DataFrame(
                    0,
                    index=range(num_rows_diff),
                    columns=X.columns,
                )
                X_ = pd.concat([filler, X], ignore_index=True)
                X_.ww.init(schema=X.ww.schema)
            else:
                X_ = X
            y_pred = self._component_obj.predict(
                fh=fh_,
                X=X_,
            )
        else:
            y_pred = self._component_obj.predict(
                fh=fh_,
            )
        y_pred.index = X.index
        return infer_feature_types(y_pred)

    def get_prediction_intervals(
        self,
        X: pd.DataFrame,
        y: Optional[pd.DataFrame] = None,
        coverage: Optional[List[float]] = None,
        predictions: Optional[pd.Series] = None,
    ) -> Dict[str, Dict[str, pd.Series]]:
        """Find the prediction intervals using the fitted VARMAXRegressor.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            y (pd.DataFrame): Target data of shape [n_samples, n_series_id_values]. Optional.
            coverage (list[float]): A list of floats between the values 0 and 1 that the upper and lower bounds of the
                prediction interval should be calculated for. Defaults to [0.95] when None.
            predictions (pd.Series): Not used for VARMAX regressor.

        Returns:
            dict[dict]: A dict of prediction intervals, where the dict is in the format {series_id: {coverage}_lower or {coverage}_upper}.
        """
        if coverage is None:
            coverage = [0.95]

        X, y = self._manage_woodwork(X, y)
        use_exog = (
            # If exogenous variables were used during training
            self._component_obj._fitted_forecaster.model.exog is not None
            and self.use_covariates
        )
        if use_exog:
            X = X.ww.select(exclude=["Datetime"])
            X = convert_bool_to_double(X)

        # Accesses the fitted statsmodels model within sktime
        # nsimulations represents how many steps should be simulated
        # repetitions represents the number of simulations that should be run (confusing, I know)
        # anchor represents where the simulations should start from (forecasting is done from the "end")
        y_pred = self._component_obj._fitted_forecaster.simulate(
            nsimulations=X.shape[0],
            repetitions=self._N_REPETITIONS,
            anchor="end",
            random_state=self.random_seed,
            exog=X if use_exog else None,
        )

        prediction_interval_result = {}
        # Access the target column names (i.e. the series_id values) that the VARMAX component obj was fitted on
        for series in self._component_obj._fitted_forecaster.model.endog_names:
            series_result = {}
            # Gather every simulated path belonging to this series.
            # NOTE(review): relies on `series in col` matching only this series'
            # columns - confirm the column layout returned by `simulate`.
            series_preds = y_pred[[col for col in y_pred.columns if series in col]]

            for conf_int in coverage:
                # Symmetric interval: cut (1 - coverage) / 2 off each tail of the simulations.
                prediction_interval_lower = series_preds.quantile(
                    q=round((1 - conf_int) / 2, 3),
                    axis="columns",
                )
                prediction_interval_upper = series_preds.quantile(
                    q=round((1 + conf_int) / 2, 3),
                    axis="columns",
                )
                prediction_interval_lower.index = X.index
                prediction_interval_upper.index = X.index
                series_result[f"{conf_int}_lower"] = prediction_interval_lower
                series_result[f"{conf_int}_upper"] = prediction_interval_upper
            prediction_interval_result[series] = series_result
        return prediction_interval_result

    @property
    def feature_importance(self) -> np.ndarray:
        """Returns array of 0's with a length of 1 as feature_importance is not defined for VARMAX regressor."""
        return np.zeros(1)