I pretty sure I am doing something wrong, but I have no idea what it is.
I have weekly data from 2018 - 09/2022 and I am trying to forecast the last 13 weeks of 2022 out of sample using the last 13 weeks of 2021 as the exog variables for prediction.
The SARIMA_SEL_K function works really well in sample I.e., it performs well on the test set after training. However when I try to do out of sample prediction the forecasts looks very odd and does not capture the end of year decline that is inherent in the series seasonality.
I am using Fourier terms, with k-terms selected by minimizing AIC through auto_arima... either there is a fatal flaw in my understanding of this process or an error in my code. I cannot work out which one it is.
Question: Does it make sense to use historical data for exog Fourier terms? The rationale for doing this is a sharp decrease in values towards the end of the year. In my mind, using last year's data will allow the forecast to mimic this end-of-year behaviour while allowing the ARIMA components to take care of the short-term dynamics idiosyncratic to the current year.
Does this make conceptual sense as a forecasting strategy?
data['ts'] = pd.date_range('2017/12/29' ,'2022/12/23',freq='W-FRI')
data= data.set_index(pd.DatetimeIndex(data['ts'], freq = 'W-FRI'))
def fourier_transform(f, k , data):
four_terms = FourierFeaturizer(f, k)
y_prime, exog = four_terms.fit_transform(data)
exog['date'] = y_prime.index
exog = exog.set_index(exog['date'])
exog.index.freq = 'W-FRI'
exog = exog.drop(columns=['date'])
return (y_prime, exog)
def SARIMA_SEL_K(m, range_k, data, n):
if m > 1:
score_best = np.inf
best_model = 0
len_k = 0
for k in range_k:
y_prime, exog = fourier_transform(m, k, data)
y_to_train = y_prime.iloc[:(len(y_prime)-n)]
y_to_test = y_prime.iloc[(len(y_prime)-n):] # last year for testing
exog_to_train = exog.iloc[:(len(exog)-n)]
exog_to_test = exog.iloc[(len(exog)-n):]
model_auto_arima = pm.auto_arima(y_to_train, exog_to_train, n_fits = 15
#out_of_sample_size = n, scoring = 'mse',
)
order = model_auto_arima.get_params().get('order')
new_model = model_auto_arima
## test
preds = model_auto_arima.predict(len(y_to_test), exog_to_test)
abs_error = abs(y_to_test.values - preds)
N = 1/len(y_to_test)
score_value = N*np.sum(abs_error)
print('$k = ' + str(k) + ': ' + str(score_value) + str(order))
if score_value < score_best:
score_best = score_value
best_model = new_model
len_k = k
else:
pass
print(len_k)
else:
best_model = pm.auto_arima(data, n_fits = 15 )
return(best_model)
forecast_n = 13
model = SARIMA_SEL_K(52, [k for k in range(1 , 26, 1 )] ,data, forecast_n)
test_y, test_exog = fourier_transform(52, 9, data[-forecast_n:])
model_refit = model.update(test_y, test_exog)
pred_y, pred_exog = fourier_transform(52, 9, data_pred)
fc,ci = model.predict(forecast_n, pred_exog, return_conf_int = True, start='2022-09-30' , end='2022-12-23')
plt.figure(figsize=(12,5), dpi=100)
plt.plot(data.loc['2022-01-07':'2022-12-23'].index, data.loc['2022-01-07':'2022-12-23'], label='Actual')
plt.plot(fc, label = 'Forecast' )