import numpy as np
import pandas as pd
import matplotlib as mpl
import statsmodels.tsa.api as tsa
import statsmodels.api as sm
import pandas_datareader.data as web
import seaborn as sns
from scipy import stats
from itertools import product
from tqdm import tqdm_notebook
import logging
%matplotlib inline
mpl.style.use('seaborn-talk')
logger = logging.getLogger('py.warnings')  # captureWarnings routes all warnings to this logger
logger.setLevel(logging.WARNING)
handler = logging.FileHandler('convergence_warnings.tmp.log')
handler.setLevel(logging.WARNING)
logger.addHandler(handler)
logging.captureWarnings(True)
def plot_acf(data, ax, nlags=50, pct=.95, **kwargs):
    """
    Plot the autocorrelation function of the data and the associated Bartlett bands.
    Under the null of no dependence in the data, the Bartlett bands' width is constant.

    Parameters
    ----------
    data : pandas dataframe
    ax : matplotlib axes
    nlags : positive int
        The number of lags to plot.
    pct : float in [0,1]
        The coverage probability of the Bartlett bands.
    kwargs : parameters to be passed to matplotlib's fill_between.
    """
    # Scatter the sample autocorrelations at lags 1 through nlags-1.
    ax.scatter(np.arange(1, nlags), sm.tsa.acf(x=data, unbiased=True, nlags=nlags-1)[1:])
    # Under the null, each sample autocorrelation is approximately N(0, 1/T).
    ci_bottom, ci_top = stats.norm.interval(alpha=pct, scale=data.size**-.5, loc=0)
    ax.fill_between(x=np.arange(1, nlags), y1=ci_bottom, y2=ci_top, **kwargs)
    ax.axhline(0, color='black')
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))
    ax.set_ylim([-1, 1])
    ax.set_xlim([0, nlags-1])
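As a quick sanity check (my addition, not in the original analysis), applying the function to simulated Gaussian white noise should leave roughly pct of the autocorrelations inside the Bartlett bands:
# Hypothetical demo: under independence, about 95% of the sample
# autocorrelations should fall inside the bands.
white_noise = pd.Series(np.random.standard_normal(1000))
fig0, ax0 = mpl.pyplot.subplots()
plot_acf(white_noise, ax=ax0, nlags=30, alpha=0.25, color='grey')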
data = web.DataReader('USEPUINDXD', 'fred', start='1800')
data = pd.DataFrame(data.values, index=pd.DatetimeIndex(data.index, freq='D'), columns=['EPU'])
recession_dates = web.DataReader('USREC', 'fred', start='1800')
I want the recession dates and the EPU series to share the same index.
recession_dates = recession_dates.resample('D').ffill().reindex(data.index).ffill()
I plot the data over time, shading recessions in gray.
fig1, ax1 = mpl.pyplot.subplots()
fig1.set_size_inches((12,6.75))
ax1.plot(data)
ax1.fill_between(recession_dates.index, 725 * recession_dates.values.ravel(), color='gray',alpha=0.25)
ax1.set_ylim([1,725])
ax1.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig1.savefig("../doc/figures/epu.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
I plot a histogram. The black line is a Gaussian density fit to the data.
fig2, ax2 = mpl.pyplot.subplots()
fig2.set_size_inches((12,6.75))
sns.distplot(data, rug=True, ax=ax2, fit=stats.norm)
ax2.set_ylabel('Density')
fig2.savefig("../doc/figures/epu_hist.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
data.describe()
stats.describe(data)
Again, the data are always positive, fat-tailed (excess kurtosis > 0), and skewed. We should be able to get a more reasonable Gaussian approximation by taking logs.
log_data = data.transform(np.log)
log_data.describe()
stats.describe(log_data)
Now we can see that the Gaussian density fits the data reasonably well. There is some small skewness and excess kurtosis left, but it is not bad.
fig3, ax3 = mpl.pyplot.subplots()
fig3.set_size_inches((12,6.75))
sns.distplot(log_data, rug=True, ax=ax3, fit=stats.norm)
ax3.set_ylabel('Density')
fig3.savefig("../doc/figures/log_epu_hist.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
fig4, ax4 = mpl.pyplot.subplots()
fig4.set_size_inches((12,6.75))
ax4.plot(log_data)
ax4.fill_between(recession_dates.index, 7.25 * recession_dates.values.ravel(), color='gray',alpha=0.5)
ax4.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
ax4.set_ylim([0.01, 7.25])
fig4.savefig("../doc/figures/log_epu.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
As we can see, the data are quite persistent but still far from a random walk.
fig5, ax5 = mpl.pyplot.subplots()
fig5.set_size_inches((12,6.75))
plot_acf(log_data, ax=ax5, nlags=30, alpha=0.25, color='grey')
ax5.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig5.savefig("../doc/figures/log_epu_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
Instead of the autocorrelations starting at essentially $1$ as they did in the previous example, here they start at about $0.50$. Hence, the data are mean-reverting rather than trending, and I can skip modeling the trend as we did before.
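To formalize the mean-reversion claim (a check I am adding here, not part of the original analysis), we can run an augmented Dickey-Fuller test on the log series; a small p-value rejects the unit-root null:
# Augmented Dickey-Fuller test; the first two return values are the
# test statistic and its p-value.
adf_stat, adf_pvalue = tsa.adfuller(log_data.squeeze())[:2]
print('ADF statistic: {:.2f}, p-value: {:.3g}'.format(adf_stat, adf_pvalue))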
model1 = tsa.ARIMA(log_data, order=(1,0,0)).fit(method='css')
model1.summary2()
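One way to read the estimated AR(1) coefficient $\phi$ (a back-of-the-envelope calculation I am adding) is through the implied half-life of a shock, $\log(0.5)/\log(\phi)$ days:
# Half-life: the number of days until a shock's effect decays by half.
phi = model1.arparams[0]  # estimated AR(1) coefficient
print('Implied half-life of a shock: {:.1f} days'.format(np.log(0.5) / np.log(phi)))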
fig6, ax6 = mpl.pyplot.subplots()
fig6.set_size_inches((12,6.75))
plot_acf(model1.resid, ax=ax6, nlags=30, alpha=0.25, color='grey')
ax6.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig6.savefig("../doc/figures/epu_acorr_ar1.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
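Next I try richer specifications: a second autoregressive lag, a purely seasonal model at the weekly frequency, and then combinations of the two.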
model2 = tsa.SARIMAX(log_data, trend='c', order=(2,0,0)).fit()
model2.summary()
fig8, ax8 = mpl.pyplot.subplots()
fig8.set_size_inches((12,6.75))
plot_acf(model2.resid, ax=ax8, nlags=30, alpha=0.5, color='grey')
ax8.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig8.savefig("../doc/figures/epu_acorr_ar2.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
model3 = tsa.SARIMAX(log_data, trend='c', order=(0,0,0), seasonal_order=(1,0,0, 7)).fit()
model3.summary()
fig7, ax7 = mpl.pyplot.subplots()
fig7.set_size_inches((12,6.75))
plot_acf(model3.resid, ax=ax7, nlags=30, alpha=0.5, color='grey')
ax7.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig7.savefig("../doc/figures/only_seasonality_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
model4 = tsa.SARIMAX(log_data, trend='c', order=(1,0,0), seasonal_order=(1,0,0, 7)).fit()
model4.summary()
fig8, ax8 = mpl.pyplot.subplots()
fig8.set_size_inches((12,6.75))
plot_acf(model4.resid, ax=ax8, nlags=30, alpha=0.25, color='grey')
ax8.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig8.savefig("../doc/figures/ar1_plus_seasonality_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
frameon=False)
model5 = tsa.SARIMAX(log_data.EPU, trend='c', order=(2,0,0), seasonal_order=(1,0,0, 7)).fit()
model5.summary()
fig9, ax9 = mpl.pyplot.subplots()
fig9.set_size_inches((12,6.75))
plot_acf(model5.resid, ax=ax9, nlags=30, alpha=0.25, color='grey')
ax9.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
model6 = tsa.SARIMAX(log_data, trend='c', order=(5,0,0), seasonal_order=(2,0,0, 7)).fit()
model6.summary()
fig9, ax9 = mpl.pyplot.subplots()
fig9.set_size_inches((12,6.75))
plot_acf(model6.resid, ax=ax9, nlags=30, alpha=0.25, color='grey')
ax9.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig9.savefig("../doc/figures/ar5_plus_seasonality_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
frameon=False)
fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
ax10.plot(log_data['EPU'], label='Data')
ax10.plot(model6.fittedvalues, label='Fitted Values', color='orange')
ax10.legend()
fig10.savefig("../doc/figures/ar5_plus_seasonality_fittedvalues.tmp.pdf", bbox_inches="tight", pad_inches=0,
frameon=False)
fig11, ax11 = mpl.pyplot.subplots()
fig11.set_size_inches((12,6.75))
ax11.plot(model6.resid, color='orange')
ax11.set_ylim(-6,6)
fig11.savefig("../doc/figures/ar5_plus_seasonality_residuals.tmp.pdf", bbox_inches="tight", pad_inches=0,
frameon=False)
In this section, I estimate the model for each combination of autoregressive and seasonal lag orders. I then compute the AIC and BIC for each model and see which one is smallest. This lets us choose a model with an appropriate level of parsimony in a principled way.
However, I first set up a log file to redirect the convergence warnings to (see the logging configuration at the top of the notebook).
bic_results = {}
aic_results = {}
max_ar = 10
seasonal_lags = [4, 2, 1]
for ar_order, season_order in tqdm_notebook(product(range(1, max_ar + 1), seasonal_lags),
                                            total=max_ar * len(seasonal_lags)):
    model = tsa.SARIMAX(log_data, trend='c', order=(ar_order, 0, 0),
                        seasonal_order=(season_order, 0, 0, 7)).fit()
    bic_results[(ar_order, season_order)] = model.bic
    aic_results[(ar_order, season_order)] = model.aic
I now find the model with the smallest BIC.
min(bic_results, key=bic_results.get)
I now do the same for the AIC.
min(aic_results, key=aic_results.get)
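To see how close the competing specifications are (a sketch; the original only reports the minimizers), the two criteria can be collected into a table and sorted:
# Rows are (ar_order, season_order) pairs; lower values are better.
ic_table = pd.DataFrame({'AIC': aic_results, 'BIC': bic_results})
ic_table.sort_values('BIC').head()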
model7 = tsa.SARIMAX(log_data, trend='c', order=(10,0,0), seasonal_order=(4,0,0, 7)).fit()
model7.summary()
fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
plot_acf(model7.resid, ax=ax10, nlags=30, alpha=0.25, color='grey')
ax10.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig10.savefig("../doc/figures/ar9_plus_seasonality_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
frameon=False)
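Finally, instead of a seasonal AR polynomial, the weekly pattern can be captured directly with day-of-week indicator variables used as exogenous regressors. The pivot below builds one indicator column per weekday.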
# Build one indicator (0/1) column per day of the week.
log_data['dayofweek'] = log_data.index.dayofweek
log_data['value'] = 1
pivoted_data = pd.pivot_table(log_data, index=[log_data.index, log_data['EPU']],
                              columns=['dayofweek'], values='value',
                              fill_value=0).reset_index().set_index('DATE')
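An equivalent and arguably more direct construction of these indicators (a sketch; the model below keeps using pivoted_data) relies on pd.get_dummies:
# Hypothetical alternative: one 0/1 column per weekday, aligned on the EPU dates.
dow_dummies = pd.get_dummies(log_data.index.dayofweek, prefix='dow')
dow_dummies.index = log_data.index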
model5 = tsa.SARIMAX(log_data.EPU, exog=pivoted_data.iloc[:,1:], trend='n', order=(5,0,0)).fit()
model5.summary()
fig9, ax9 = mpl.pyplot.subplots()
fig9.set_size_inches((12,6.75))
plot_acf(model5.resid, ax=ax9, nlags=30, alpha=0.25, color='grey')
ax9.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
Copyright 2018 Paul Sangrey
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.