Source code for evalml.pipelines.time_series_regression_pipeline
"""Pipeline base class for time series regression problems."""importnumpyasnpimportpandasaspdfromwoodwork.statistics_utilsimportinfer_frequencyfromevalml.model_familyimportModelFamilyfromevalml.pipelines.componentsimportSTLDecomposerfromevalml.pipelines.time_series_pipeline_baseimportTimeSeriesPipelineBasefromevalml.problem_typesimportProblemTypes,is_multiseriesfromevalml.utils.woodwork_utilsimportinfer_feature_types
class TimeSeriesRegressionPipeline(TimeSeriesPipelineBase):
    """Pipeline base class for time series regression problems.

    Args:
        component_graph (ComponentGraph, list, dict): ComponentGraph instance, list of components in order, or dictionary of components.
            Accepts strings or ComponentBase subclasses in the list.
            Note that when duplicate components are specified in a list, the duplicate component names will be modified with the
            component's index in the list. For example, the component graph
            [Imputer, One Hot Encoder, Imputer, Logistic Regression Classifier] will have names
            ["Imputer", "One Hot Encoder", "Imputer_2", "Logistic Regression Classifier"]
        parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values.
            An empty dictionary {} implies using all default values for component parameters. Pipeline-level
            parameters such as time_index, gap, and max_delay must be specified with the "pipeline" key. For example:
            Pipeline(parameters={"pipeline": {"time_index": "Date", "max_delay": 4, "gap": 2}}).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Example:
        >>> pipeline = TimeSeriesRegressionPipeline(component_graph=["Simple Imputer", "Linear Regressor"],
        ...                                         parameters={"Simple Imputer": {"impute_strategy": "mean"},
        ...                                                     "pipeline": {"gap": 1, "max_delay": 1, "forecast_horizon": 1, "time_index": "date"}},
        ...                                         custom_name="My TimeSeriesRegression Pipeline")
        ...
        >>> assert pipeline.custom_name == "My TimeSeriesRegression Pipeline"
        >>> assert pipeline.component_graph.component_dict.keys() == {'Simple Imputer', 'Linear Regressor'}

        The pipeline parameters will be chosen from the default parameters for every component, unless specific parameters
        were passed in as they were above.

        >>> assert pipeline.parameters == {
        ...     'Simple Imputer': {'impute_strategy': 'mean', 'fill_value': None},
        ...     'Linear Regressor': {'fit_intercept': True, 'n_jobs': -1},
        ...     'pipeline': {'gap': 1, 'max_delay': 1, 'forecast_horizon': 1, 'time_index': "date"}}
    """

    problem_type = ProblemTypes.TIME_SERIES_REGRESSION
    """ProblemTypes.TIME_SERIES_REGRESSION"""

    # Model families for which `get_prediction_intervals` delegates to the
    # estimator's own intervals (when the pipeline also has an STL Decomposer)
    # instead of deriving intervals from training residuals.
    NO_PREDS_PI_ESTIMATORS = [
        ModelFamily.ARIMA,
        ModelFamily.EXPONENTIAL_SMOOTHING,
        ModelFamily.PROPHET,
        ModelFamily.VARMAX,
    ]

    # Rounded two-sided standard-normal multipliers kept verbatim so results
    # for these coverages are unchanged; other coverages are computed exactly.
    _COVERAGE_MULTIPLIERS = {0.75: 1.15, 0.85: 1.44, 0.95: 1.96}

    def fit(self, X, y):
        """Fit a time series pipeline.

        Args:
            X (pd.DataFrame or np.ndarray): The input training data of shape [n_samples, n_features].
            y (pd.Series, np.ndarray): The target training targets of length [n_samples].

        Returns:
            self

        Raises:
            ValueError: If the target is not numeric.
        """
        X, y = self._convert_to_woodwork(X, y)
        # The frequency is remembered so forecast dates can be generated later.
        self.frequency = infer_frequency(X[self.time_index])
        if "numeric" not in y.ww.semantic_tags:
            raise ValueError(
                "Time Series Regression pipeline can only handle numeric target data!",
            )

        X, y = self._drop_time_index(X, y)
        self._fit(X, y)
        return self

    def score(self, X, y, objectives, X_train=None, y_train=None):
        """Evaluate model performance on current and additional objectives.

        Args:
            X (pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
            y (pd.Series): True labels of length [n_samples].
            objectives (list): Non-empty list of objectives to score on.
            X_train (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features].
            y_train (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train].

        Returns:
            dict: Ordered dictionary of objective scores.
        """
        X, y = self._convert_to_woodwork(X, y)
        X_train, y_train = self._convert_to_woodwork(X_train, y_train)
        objectives = self.create_objectives(objectives)
        y_predicted = self.predict_in_sample(X, y, X_train, y_train)
        return self._score_all_objectives(
            X,
            y,
            y_predicted,
            y_pred_proba=None,
            objectives=objectives,
            y_train=y_train,
        )

    def get_forecast_period(self, X):
        """Generates all possible forecasting time points based on latest data point in X.

        Args:
            X (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features].

        Raises:
            ValueError: If pipeline is not trained.

        Returns:
            pd.Series: Datetime periods from `gap` to `forecast_horizon + gap`.

        Example:
            >>> X = pd.DataFrame({'date': pd.date_range(start='1-1-2022', periods=10, freq='D'), 'feature': range(10, 20)})
            >>> y = pd.Series(range(0, 10), name='target')
            >>> gap = 1
            >>> forecast_horizon = 2
            >>> pipeline = TimeSeriesRegressionPipeline(component_graph=["Linear Regressor"],
            ...                                         parameters={"Simple Imputer": {"impute_strategy": "mean"},
            ...                                                     "pipeline": {"gap": gap, "max_delay": 1, "forecast_horizon": forecast_horizon, "time_index": "date"}},
            ...                                        )
            >>> pipeline.fit(X, y)
            pipeline = TimeSeriesRegressionPipeline(component_graph={'Linear Regressor': ['Linear Regressor', 'X', 'y']}, parameters={'Linear Regressor':{'fit_intercept': True, 'n_jobs': -1}, 'pipeline':{'gap': 1, 'max_delay': 1, 'forecast_horizon': 2, 'time_index': 'date'}}, random_seed=0)
            >>> dates = pipeline.get_forecast_period(X)
            >>> expected = pd.Series(pd.date_range(start='2022-01-11', periods=forecast_horizon, freq='D').shift(gap), name='date', index=[10, 11])
            >>> assert dates.equals(expected)
        """
        if not self._is_fitted:
            raise ValueError("Pipeline must be fitted before getting forecast.")

        X = infer_feature_types(X)

        # Generate prediction periods. One extra period is requested because
        # the first generated row (anchored on the last observed date) is
        # dropped below.
        first_date = X.iloc[-1][self.time_index]
        predicted_date_range = pd.Series(
            pd.date_range(
                start=first_date,
                periods=self.forecast_horizon + 1,
                freq=self.frequency,
            ).shift(self.gap),
        )

        # Generate numerical index. Continue from the last label when X has an
        # integer index, otherwise fall back to positional numbering. The
        # previous `isinstance(X.index.dtype, int)` check could never be true
        # (a dtype object is not an int), so the label branch was unreachable.
        if pd.api.types.is_integer_dtype(X.index):
            first_idx = int(X.index[-1])
        else:
            first_idx = len(X) - 1
        num_idx = pd.Series(range(first_idx, first_idx + predicted_date_range.size))
        predicted_date_range.index = num_idx
        predicted_date_range = predicted_date_range.drop(predicted_date_range.index[0])
        predicted_date_range.name = self.time_index
        return predicted_date_range

    def get_forecast_predictions(self, X, y):
        """Generates all possible forecasting predictions based on last period of X.

        Args:
            X (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features].
            y (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train].

        Returns:
            Predictions from `gap` periods out to `forecast_horizon + gap` periods.
        """
        X, y = self._convert_to_woodwork(X, y)
        # The forecast dates become a one-column frame used as the prediction
        # input, with the supplied data acting as the training context.
        pred_dates = pd.DataFrame(self.get_forecast_period(X))
        preds = self.predict(pred_dates, objective=None, X_train=X, y_train=y)
        return preds

    @classmethod
    def _coverage_to_multiplier(cls, cov):
        """Return the standard-deviation multiplier for a two-sided coverage level.

        Args:
            cov (float): Coverage level strictly between 0 and 1.

        Returns:
            float: The tabulated multiplier for 0.75, 0.85 and 0.95, otherwise
                the standard normal quantile at ``0.5 + cov / 2``.

        Raises:
            ValueError: If the coverage is not tabulated and not strictly between 0 and 1.
        """
        if cov in cls._COVERAGE_MULTIPLIERS:
            return cls._COVERAGE_MULTIPLIERS[cov]
        if not 0 < cov < 1:
            raise ValueError(
                f"Coverage must be strictly between 0 and 1, received {cov}.",
            )
        return NormalDist().inv_cdf(0.5 + cov / 2)

    def get_prediction_intervals(
        self,
        X,
        y=None,
        X_train=None,
        y_train=None,
        coverage=None,
    ):
        """Find the prediction intervals using the fitted regressor.

        Two strategies are used. When the estimator's model family is listed in
        ``NO_PREDS_PI_ESTIMATORS`` and the pipeline contains an STL Decomposer,
        the estimator's own prediction intervals are used: the estimator's
        predictions are subtracted from each bound and the decomposer's trend
        prediction intervals and ``y`` are added.

        Otherwise the intervals are derived from training residuals: the root
        mean squared difference between ``y_train`` and the in-sample
        predictions is scaled by a coverage multiplier and by the square root
        of the forecast step (1 for the first prediction, 2 for the second,
        ...), then subtracted from / added to each prediction.

        Args:
            X (pd.DataFrame): Data of shape [n_samples, n_features].
            y (pd.Series): Target data.
            X_train (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features].
            y_train (pd.Series, np.ndarray): Targets used to train the pipeline of shape [n_samples_train].
            coverage (list[float]): A list of floats between the values 0 and 1 that the upper and lower bounds of the
                prediction interval should be calculated for. Defaults to [0.95].

        Returns:
            dict: Prediction intervals, keys are in the format {coverage}_lower or {coverage}_upper.

        Raises:
            MethodPropertyNotFoundError: If the estimator does not support Time Series Regression as a problem type.
            ValueError: If the residual-based strategy is used with a coverage that is not strictly between 0 and 1.
        """
        has_stl = STLDecomposer.name in self.component_graph.component_instances.keys()
        if coverage is None:
            coverage = [0.95]

        if self.estimator.model_family in self.NO_PREDS_PI_ESTIMATORS and has_stl:
            # Imported lazily (as in the original code) and unconditionally so
            # `stack_data` is always bound when the multiseries path needs it.
            from evalml.pipelines.utils import (
                stack_data,
                unstack_multiseries,
            )

            def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
                # Shift every bound of a single series by the trend interval
                # and the target after removing the estimator's prediction.
                return {
                    key: pd.Series(
                        (orig_pi_values.values - residuals.values)
                        + trend_pred_intervals[key].values
                        + y.values,
                        index=orig_pi_values.index,
                    )
                    for key, orig_pi_values in intervals.items()
                }

            if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
                X, y = unstack_multiseries(
                    X,
                    y,
                    self.series_id,
                    self.time_index,
                    self.input_target_name,
                )

            X_no_datetime, y_no_datetime = self._drop_time_index(X, y)

            estimator_input = self.transform_all_but_final(
                X_no_datetime,
                y_no_datetime,
                X_train=X_train,
                y_train=y_train,
            )
            pred_intervals = self.estimator.get_prediction_intervals(
                X=estimator_input,
                y=y,
                coverage=coverage,
            )
            residuals = self.estimator.predict(estimator_input)
            trend_pred_intervals = self.get_component(
                "STL Decomposer",
            ).get_trend_prediction_intervals(y, coverage=coverage)

            if not is_multiseries(self.problem_type):
                return _get_series_intervals(
                    pred_intervals,
                    residuals,
                    trend_pred_intervals,
                    y,
                )

            # Coverage label is label for each prediction interval limit (e.g. "0.95_lower")
            coverage_labels = list(list(pred_intervals.values())[0].keys())

            # Store prediction interval data in {coverage_label: {series_id: bound_value}}
            interval_series_pred_intervals = {
                coverage_label: {} for coverage_label in coverage_labels
            }

            # `pred_intervals` are in {series_id: {coverage_label: bound_value}} form
            for series_id, series_intervals in pred_intervals.items():
                series_id_target_name = str(series_id)
                series_id_prediction_intervals = _get_series_intervals(
                    series_intervals,
                    residuals[series_id],
                    trend_pred_intervals[series_id_target_name],
                    y[series_id_target_name],
                )
                # Store `series_id_prediction_intervals` data in `interval_series_pred_intervals` format
                for (
                    coverage_label,
                    bound_value,
                ) in series_id_prediction_intervals.items():
                    interval_series_pred_intervals[coverage_label][
                        series_id_target_name
                    ] = bound_value

            # Stack bound data for each coverage label so each bound has a single pd.Series
            transformed_pred_intervals = {}
            for coverage_label in coverage_labels:
                series_id_interval_df = pd.DataFrame(
                    interval_series_pred_intervals[coverage_label],
                )
                transformed_pred_intervals[coverage_label] = stack_data(
                    data=series_id_interval_df,
                    series_id_name=self.series_id,
                )
            return transformed_pred_intervals

        # Residual-based intervals.
        future_vals = self.predict(
            X=X,
            X_train=X_train,
            y_train=y_train,
        )
        predictions_train = self.predict_in_sample(
            X=X_train,
            y=y_train,
            X_train=X_train,
            y_train=y_train,
            calculating_residuals=True,
        )
        if self.component_graph.has_dfs:
            # Realign so the subtraction below pairs rows by y_train's labels.
            predictions_train.index = y_train.index
        residuals = y_train - predictions_train
        std_residual = np.sqrt(np.sum(residuals**2) / len(residuals))

        # The interval widens with the square root of the forecast step; this
        # does not depend on the coverage, so compute it once.
        horizon_scale = np.sqrt(np.arange(1, len(future_vals) + 1))

        res_dict = {}
        for cov in coverage:
            margins = self._coverage_to_multiplier(cov) * std_residual * horizon_scale
            lower = [val - margin for val, margin in zip(future_vals, margins)]
            upper = [val + margin for val, margin in zip(future_vals, margins)]
            res_dict[f"{cov}_lower"] = pd.Series(
                lower,
                name=f"{cov}_lower",
                index=future_vals.index,
            )
            res_dict[f"{cov}_upper"] = pd.Series(
                upper,
                name=f"{cov}_upper",
                index=future_vals.index,
            )
        return res_dict