Source code for evalml.pipelines.components.estimators.regressors.arima_regressor
"""Autoregressive Integrated Moving Average Model. The three parameters (p, d, q) are the AR order, the degree of differencing, and the MA order. More information here: https://www.statsmodels.org/devel/generated/statsmodels.tsa.arima.model.ARIMA.html."""fromtypingimportDict,Hashable,List,Optional,Tuple,Unionimportnumpyasnpimportpandasaspdfrompandas.api.typesimportis_integer_dtypefromskopt.spaceimportIntegerfromsktime.forecasting.baseimportForecastingHorizonfromevalml.model_familyimportModelFamilyfromevalml.pipelines.components.estimatorsimportEstimatorfromevalml.pipelines.components.utilsimportconvert_bool_to_double,match_indicesfromevalml.problem_typesimportProblemTypesfromevalml.utilsimport(import_or_raise,infer_feature_types,)
class ARIMARegressor(Estimator):
    """Autoregressive Integrated Moving Average Model. The three parameters (p, d, q) are the AR order, the degree of differencing, and the MA order. More information here: https://www.statsmodels.org/devel/generated/statsmodels.tsa.arima.model.ARIMA.html.

    Currently ARIMARegressor isn't supported via conda install. It's recommended that it be installed via PyPI.

    Args:
        time_index (str): Specifies the name of the column in X that provides the datetime objects. Defaults to None.
        trend (str): Controls the deterministic trend. Options are ['n', 'c', 't', 'ct'] where 'c' is a constant term,
            't' indicates a linear trend, and 'ct' is both. Can also be an iterable when defining a polynomial, such
            as [1, 1, 0, 1].
        start_p (int): Minimum Autoregressive order. Defaults to 2.
        d (int): Minimum Differencing degree. Defaults to 0.
        start_q (int): Minimum Moving Average order. Defaults to 2.
        max_p (int): Maximum Autoregressive order. Defaults to 5.
        max_d (int): Maximum Differencing degree. Defaults to 2.
        max_q (int): Maximum Moving Average order. Defaults to 5.
        seasonal (boolean): Whether to fit a seasonal model to ARIMA. Defaults to True.
        sp (int or str): Period for seasonal differencing, specifically the number of periods in each season. If "detect", this
            model will automatically detect this parameter (given the time series is a standard frequency) and will fall
            back to 1 (no seasonality) if it cannot be detected. Defaults to 1.
        n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. Defaults to -1.
        random_seed (int): Seed for the random number generator. Defaults to 0.
        maxiter (int): Forwarded unchanged to sktime's ``AutoARIMA`` as ``maxiter``. Defaults to 10.
        use_covariates (bool): If False, the feature columns of X are not handed to the underlying
            model during fitting or prediction. Defaults to True.
        **kwargs: Any additional keyword arguments are forwarded to sktime's ``AutoARIMA``.
    """

    name = "ARIMA Regressor"
    hyperparameter_ranges = {
        "start_p": Integer(1, 3),
        "d": Integer(0, 2),
        "start_q": Integer(1, 3),
        "max_p": Integer(3, 10),
        "max_d": Integer(2, 5),
        "max_q": Integer(3, 10),
        "seasonal": [True, False],
    }
    """{
        "start_p": Integer(1, 3),
        "d": Integer(0, 2),
        "start_q": Integer(1, 3),
        "max_p": Integer(3, 10),
        "max_d": Integer(2, 5),
        "max_q": Integer(3, 10),
        "seasonal": [True, False],
    }"""
    model_family = ModelFamily.ARIMA
    """ModelFamily.ARIMA"""
    supported_problem_types = [ProblemTypes.TIME_SERIES_REGRESSION]
    """[ProblemTypes.TIME_SERIES_REGRESSION]"""

    max_rows = 1000
    max_cols = 7

    # NOTE(review): `_manage_woodwork`, `_manage_types_and_forecast` and
    # `_parse_prediction_intervals` are called below but are not defined in the
    # visible listing -- presumably inherited or defined elsewhere; confirm.

    def __init__(
        self,
        time_index: Optional[Hashable] = None,
        trend: Optional[str] = None,
        start_p: int = 2,
        d: int = 0,
        start_q: int = 2,
        max_p: int = 5,
        max_d: int = 2,
        max_q: int = 5,
        seasonal: bool = True,
        sp: Union[int, str] = 1,
        n_jobs: int = -1,
        random_seed: Union[int, float] = 0,
        maxiter: int = 10,
        use_covariates: bool = True,
        **kwargs,
    ):
        # 95% bounds produced by the most recent `predict` call; reused by
        # `get_prediction_intervals` so that coverage is not recomputed.
        self.preds_95_upper = None
        self.preds_95_lower = None

        parameters = {
            "trend": trend,
            "start_p": start_p,
            "d": d,
            "start_q": start_q,
            "max_p": max_p,
            "max_d": max_d,
            "max_q": max_q,
            "seasonal": seasonal,
            "maxiter": maxiter,
            "n_jobs": n_jobs,
        }
        parameters.update(kwargs)

        arima_model_msg = (
            "sktime is not installed. "
            "Please install using `pip install sktime.`"
        )
        sktime_arima = import_or_raise(
            "sktime.forecasting.arima",
            error_msg=arima_model_msg,
        )
        # Only the keys gathered so far are AutoARIMA arguments; the
        # evalml-only settings are added to `parameters` afterwards.
        arima_model = sktime_arima.AutoARIMA(**parameters)

        parameters["use_covariates"] = use_covariates
        parameters["time_index"] = time_index

        self.sp = sp
        self.use_covariates = use_covariates

        super().__init__(
            parameters=parameters,
            component_obj=arima_model,
            random_seed=random_seed,
        )

    def _remove_datetime(
        self,
        data: pd.DataFrame,
        features: bool = False,
    ) -> Optional[pd.DataFrame]:
        """Return a copy of ``data`` without datetime information.

        Args:
            data (pd.DataFrame or pd.Series): Woodwork-initialized data, or None.
            features (bool): If True, columns with the ``Datetime`` logical type are
                dropped in addition to resetting a time-like index.

        Returns:
            A copy of ``data`` whose Datetime/Period/Interval index (if any) was replaced
            by a default index, or None when ``data`` is None.
        """
        if data is None:
            return None
        data_no_dt = data.ww.copy()
        if isinstance(
            data_no_dt.index,
            (pd.DatetimeIndex, pd.PeriodIndex, pd.IntervalIndex),
        ):
            data_no_dt = data_no_dt.ww.reset_index(drop=True)
        if features:
            data_no_dt = data_no_dt.ww.select(exclude=["Datetime"])
        return data_no_dt

    def _set_forecast(self, X: pd.DataFrame):
        """Build the relative forecasting horizon for the rows of ``X``.

        The horizon starts ``units_diff`` steps after the last index seen in `fit`
        (``self.last_X_index``) and contains one step per row of ``X``.

        Args:
            X (pd.DataFrame): Data to predict on.

        Returns:
            ForecastingHorizon: Relative horizon with ``len(X)`` consecutive steps.
        """
        # we can only calculate the difference if the indices are of the same type
        units_diff = 1
        if isinstance(X.index[0], type(self.last_X_index)) and isinstance(
            X.index,
            pd.DatetimeIndex,
        ):
            dates_diff = pd.date_range(
                start=self.last_X_index,
                end=X.index[0],
                freq=X.index.freq,
            )
            # date_range includes both endpoints, hence the -1
            units_diff = len(dates_diff) - 1
        elif is_integer_dtype(type(X.index[0])) and is_integer_dtype(
            type(self.last_X_index),
        ):
            units_diff = X.index[0] - self.last_X_index
        fh_ = ForecastingHorizon(
            [units_diff + i for i in range(len(X))],
            is_relative=True,
        )
        return fh_

    def _get_sp(self, X: Optional[pd.DataFrame]) -> int:
        """Resolve the seasonal period handed to the underlying model.

        Args:
            X (pd.DataFrame): Training features; only consulted when ``sp`` is "detect".

        Returns:
            int: The explicit ``sp`` when one was given. For "detect": 7, 12 or 4 when the
            inferred frequency of the time index column starts with "D", "M" or "Q"
            respectively, otherwise 1.
        """
        sp = self.sp
        if sp != "detect":
            # An explicit period does not depend on X, so honour it even when
            # no features were supplied.
            return sp
        if X is None:
            return 1

        freq_mappings = {
            "D": 7,
            "M": 12,
            "Q": 4,
        }
        time_index = self._parameters.get("time_index", None)
        inferred_freqs = X.ww.infer_temporal_frequencies()
        freq = inferred_freqs.get(time_index, None)
        if freq is None:
            return 1
        # Only the leading character is used to look up the period.
        return freq_mappings.get(freq[:1], 1)

    @staticmethod
    def _pad_covariates(X: pd.DataFrame, fh_) -> pd.DataFrame:
        """Front-pad ``X`` with zero rows so it has ``fh_[-1]`` rows.

        Args:
            X (pd.DataFrame): Woodwork-initialized covariates for the requested periods.
            fh_ (ForecastingHorizon): Relative horizon matching the rows of ``X``.

        Returns:
            pd.DataFrame: ``X`` itself when the horizon starts at step 1, otherwise a new
            frame with filler rows prepended and the schema of ``X`` re-applied.
        """
        if fh_[0] == 1:
            return X
        # pmdarima (which sktime uses under the hood) only forecasts off the training data
        # but sktime circumvents this by predicting everything from the end of training data to the date / periods requested
        # and only returning the values for dates / periods given to sktime. Because of this,
        # pmdarima requires the number of covariate rows to equal the length of the total number of periods (X.shape[0] == fh_[-1]) if covariates are used.
        # We circumvent this by adding arbitrary rows to the start of X since sktime discards these values when predicting.
        num_rows_diff = fh_[-1] - X.shape[0]
        filler = pd.DataFrame(
            columns=X.columns,
            index=range(num_rows_diff),
        ).fillna(0)
        X_padded = pd.concat([filler, X], ignore_index=True)
        X_padded.ww.init(schema=X.ww.schema)
        return X_padded

    def _predict_interval_frame(
        self,
        X: pd.DataFrame,
        fh_,
        coverage: List[float],
    ) -> pd.DataFrame:
        """Query the underlying model for prediction intervals, indexed like ``X``.

        Covariates are passed (padded via `_pad_covariates`) only when ``X`` is not
        empty and ``use_covariates`` is enabled.
        """
        if not X.empty and self.use_covariates:
            y_pred_intervals = self._component_obj.predict_interval(
                fh=fh_,
                X=self._pad_covariates(X, fh_),
                coverage=coverage,
            )
        else:
            y_pred_intervals = self._component_obj.predict_interval(
                fh=fh_,
                coverage=coverage,
            )
        y_pred_intervals.index = X.index
        return y_pred_intervals

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
        """Fits ARIMA regressor to data.

        Args:
            X (pd.DataFrame): The input training data of shape [n_samples, n_features].
            y (pd.Series): The target training data of length [n_samples].

        Returns:
            self

        Raises:
            ValueError: If y was not passed in.
        """
        X, y = self._manage_woodwork(X, y)
        if X is not None:
            X = X.ww.fillna(X.mean())
        if y is None:
            raise ValueError("ARIMA Regressor requires y as input.")

        # Bounds cached by an earlier `predict` belong to the previous fit.
        self.preds_95_lower = None
        self.preds_95_upper = None

        sp = self._get_sp(X)
        self._component_obj.sp = sp

        # Remembered so `_set_forecast` can measure how far ahead to predict.
        self.last_X_index = X.index[-1] if X is not None else y.index[-1]
        X = self._remove_datetime(X, features=True)

        if X is not None:
            X = convert_bool_to_double(X)
        y = self._remove_datetime(y)
        X, y = match_indices(X, y)

        if X is not None and not X.empty and self.use_covariates:
            self._component_obj.fit(y=y, X=X)
        else:
            self._component_obj.fit(y=y)
        return self

    def predict(self, X: pd.DataFrame, y: Optional[pd.Series] = None) -> pd.Series:
        """Make predictions using fitted ARIMA regressor.

        The point prediction is the midpoint of the 95% prediction interval; the
        interval bounds are cached on the instance for `get_prediction_intervals`.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            y (pd.Series): Target data.

        Returns:
            pd.Series: Predicted values.
        """
        # NOTE(review): the previous docstring promised a ValueError when X was
        # given to `fit` but omitted here; no such check is visible in this
        # class -- confirm whether a helper enforces it.
        X, y = self._manage_woodwork(X, y)
        X, fh_ = self._manage_types_and_forecast(X=X)

        y_pred_intervals = self._predict_interval_frame(X, fh_, [0.95])
        (
            self.preds_95_lower,
            self.preds_95_upper,
        ) = ARIMARegressor._parse_prediction_intervals(y_pred_intervals, 0.95)

        y_pred = pd.concat(
            (self.preds_95_lower, self.preds_95_upper),
            axis=1,
        ).mean(axis=1)
        return infer_feature_types(y_pred)

    def get_prediction_intervals(
        self,
        X: pd.DataFrame,
        y: Optional[pd.Series] = None,
        coverage: Optional[List[float]] = None,
        predictions: Optional[pd.Series] = None,
    ) -> Dict[str, pd.Series]:
        """Find the prediction intervals using the fitted ARIMARegressor.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            y (pd.Series): Target data. Optional.
            coverage (list[float]): A list of floats between the values 0 and 1 that the upper and lower bounds of the
                prediction interval should be calculated for. Defaults to [0.95] when None.
            predictions (pd.Series): Not used for ARIMA regressor.

        Returns:
            dict: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper.
        """
        if coverage is None:
            coverage = [0.95]

        X, y = self._manage_woodwork(X, y)
        X, fh_ = self._manage_types_and_forecast(X=X)

        y_pred_intervals = self._predict_interval_frame(X, fh_, coverage)

        prediction_interval_result = {}
        for conf_int in coverage:
            if (
                conf_int == 0.95
                and self.preds_95_lower is not None
                and self.preds_95_upper is not None
            ):
                # Reuse the bounds cached by the most recent `predict` call.
                prediction_interval_result[f"{conf_int}_lower"] = self.preds_95_lower
                prediction_interval_result[f"{conf_int}_upper"] = self.preds_95_upper
                continue
            preds_lower, preds_upper = ARIMARegressor._parse_prediction_intervals(
                y_pred_intervals,
                conf_int,
            )
            prediction_interval_result[f"{conf_int}_lower"] = preds_lower
            prediction_interval_result[f"{conf_int}_upper"] = preds_upper
        return prediction_interval_result

    @property
    def feature_importance(self) -> np.ndarray:
        """Returns array of 0's with a length of 1 as feature_importance is not defined for ARIMA regressor."""
        return np.zeros(1)