import numpy as np
import pandas as pd
import matplotlib as mpl
import statsmodels.tsa.api as tsa
import statsmodels.api as sm
import pandas_datareader.data as web
import seaborn as sns
from scipy import stats
from itertools import product
from tqdm import tqdm_notebook
import logging

%matplotlib inline
mpl.style.use('seaborn-talk')

logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)
handler = logging.FileHandler('convergence_warnings.tmp.log')
handler.setLevel(logging.WARNING)
logger.addHandler(handler)
logging.captureWarnings(True)

First I define a function to compute and plot autocorrelation functions.¶

def plot_acf(data, ax, nlags=50, pct=.95, **kwargs):
    """
    This function plots the autocorrelation function of the data and the associated Barlett bands. 

    Under the null there is no dependence in the data and so the Barlett bands' width is constant.

    Parameters
    ----------
    data : pandas dataframe
    ax : matplotlib axes
    pct : float in [0,1]
    nlags : positive int
        The number of lags to plot
    kwargs : parameters to be passed to the seaborn distplot function.
    """

    ax.scatter(np.arange(1, nlags),sm.tsa.acf(x=data, unbiased=True, nlags=nlags-1)[1:])
    pit_ci_bottom, pit_ci_top = stats.norm.interval(alpha=pct, scale=data.size**-.5, loc=0) 

    ax.fill_between(x=np.arange(1,nlags), y1=pit_ci_bottom, y2=pit_ci_top, **kwargs)
    ax.axhline(0, color='black')
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(interger=True))

    ax.set_ylim([-1,1])
    ax.set_xlim([0, nlags-1])

I start by downloading the data¶

data = web.DataReader('USEPUINDXD', 'fred', start='1800')
data = pd.DataFrame(data.values, index=pd.DatetimeIndex(data.index, freq='D'), columns=['EPU'])
recession_dates = web.DataReader('USREC', 'fred', start='1800')

I want the recession dates and epu to have the same index.

recession_dates = recession_dates.resample('D').ffill().reindex(data.index).ffill()

I plot the data over time

fig1, ax1 = mpl.pyplot.subplots()
fig1.set_size_inches((12,6.75))
ax1.plot(data)
ax1.fill_between(recession_dates.index, 725 * recession_dates.values.ravel(), color='gray',alpha=0.25)
ax1.set_ylim([1,725])
ax1.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig1.savefig("../doc/figures/epu.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)

I plot a histogram. The black line is a Gaussian density fit to the data.

fig2, ax2 = mpl.pyplot.subplots()
fig2.set_size_inches((12,6.75))
sns.distplot(data, rug=True, ax=ax2, fit=stats.norm)
ax2.set_ylabel('Density')
fig2.savefig("../doc/figures/epu_hist.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)

data.describe()

stats.describe(data)

DescribeResult(nobs=12275, minmax=(array([ 3.32]), array([ 719.07])), mean=array([ 101.07073483]), variance=array([ 4706.06419868]), skewness=array([ 1.84934784]), kurtosis=array([ 5.90866379]))

Again the data are always positive, fat-tailed (excess kurtosis > 0) and skewed. We should be able to make a more reasonable approximation by taking logs.

log_data = data.transform(np.log)

log_data.describe()

stats.describe(log_data)

DescribeResult(nobs=12275, minmax=(array([ 1.19996478]), array([ 6.57795871])), mean=array([ 4.40558507]), variance=array([ 0.44644779]), skewness=array([-0.30557096]), kurtosis=array([ 0.35031361]))

Now we can see that the gaussian density fits the data reasonaly well. There is some small skewness and kurtosis, but it is not bad.

fig3, ax3 = mpl.pyplot.subplots()
fig3.set_size_inches((12,6.75))
sns.distplot(log_data, rug=True, ax=ax3, fit=stats.norm)
ax3.set_ylabel('Density')
fig3.savefig("../doc/figures/log_epu_hist.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)

fig4, ax4 = mpl.pyplot.subplots()
fig4.set_size_inches((12,6.75))
ax4.plot(log_data)
ax4.fill_between(recession_dates.index,  7.25 * recession_dates.values.ravel(), color='gray',alpha=0.5)
ax4.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
ax4.set_ylim([0.01, 7.25])
fig4.savefig("../doc/figures/log_epu.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)

As we can see the data are quite persistent but very far away from a random walk.

fig5, ax5 = mpl.pyplot.subplots()
fig5.set_size_inches((12,6.75))
plot_acf(log_data, ax=ax5, nlags=30, alpha=0.25, color='grey')
ax5.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig5.savefig("../doc/figures/log_epu_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)

Autoregressive Models¶

Instead of the autocorrelations starting at essentially $1$ as they did in the previous example, here they start at about 0.50. Hence, the data are mean-reverting and not trending. Hence, I can skip figuring out the trend like we did before.

model1 = tsa.ARIMA(log_data, order=(1,0,0)).fit(method='css')

model1.summary2()

fig6, ax6 = mpl.pyplot.subplots()
fig6.set_size_inches((12,6.75))
plot_acf(model1.resid, ax=ax6, nlags=30, alpha=0.25, color='grey')
ax6.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig6.savefig("../doc/figures/epu_acorr_ar1.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)

model2 = tsa.SARIMAX(log_data, trend='c', order=(2,0,0)).fit()

model2.summary()

fig8, ax8 = mpl.pyplot.subplots()
fig8.set_size_inches((12,6.75))
plot_acf(model2.resid, ax=ax8, nlags=30, alpha=0.5, color='grey')
ax8.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig8.savefig("../doc/figures/epu_acorr_ar2.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)

Seasonality¶

model3 = tsa.SARIMAX(log_data, trend='c', order=(0,0,0), seasonal_order=(1,0,0, 7)).fit()

model3.summary()

fig7, ax7 = mpl.pyplot.subplots()
fig7.set_size_inches((12,6.75))
plot_acf(model3.resid, ax=ax7, nlags=30, alpha=0.5, color='grey')
ax7.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig7.savefig("../doc/figures/only_seasonality_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)

model4 = tsa.SARIMAX(log_data, trend='c', order=(1,0,0), seasonal_order=(1,0,0, 7)).fit()

model4.summary()

fig8, ax8 = mpl.pyplot.subplots()
fig8.set_size_inches((12,6.75))
plot_acf(model4.resid, ax=ax8, nlags=30, alpha=0.25, color='grey')
ax8.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig8.savefig("../doc/figures/ar1_plus_seasonality_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)

model5 = tsa.SARIMAX(log_data.EPU, trend='c', order=(2,0,0), seasonal_order=(1,0,0, 7)).fit()

model5.summary()

fig9, ax9 = mpl.pyplot.subplots()
fig9.set_size_inches((12,6.75))
plot_acf(model5.resid, ax=ax9, nlags=30, alpha=0.25, color='grey')
ax9.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))

model6 = tsa.SARIMAX(log_data, trend='c', order=(5,0,0), seasonal_order=(2,0,0, 7)).fit()

model6.summary()

fig9, ax9 = mpl.pyplot.subplots()
fig9.set_size_inches((12,6.75))
plot_acf(model6.resid, ax=ax9, nlags=30, alpha=0.25, color='grey')
ax9.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig9.savefig("../doc/figures/ar5_plus_seasonality_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)

model6.summary()

fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
ax10.plot(log_data['EPU'], label='Data')
ax10.plot(model6.fittedvalues, label='Fitted Values', color='orange')
ax10.legend()
fig10.savefig("../doc/figures/ar5_plus_seasonality_fittedvalues.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)

fig11, ax11 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
ax11.plot(model6.resid, color='orange')
ax11.set_ylim(-6,6)
fig11.savefig("../doc/figures/ar5_plus_seasonality_residuals.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)

Use AIC & BIC to Choose the Appropriate Number of Lags¶

In this section, I estimate the model for the combination of autoregessive and seasonal lags. I then use this to compute the AIC and BIC for each model, and see which on is the smallest. By doing this, we can choose the model with an appropriate level of parsimony in an appropriate way.

However, first I setup a logging file to redirect warnings to.

bic_results = dict({})
aic_results = dict({})

max_ar = 10
seasonal_lags = [4,2,1]

for ar_order, season_order in tqdm_notebook(product(range(1,max_ar+1), seasonal_lags),
                                           total=max_ar*len(seasonal_lags)):

    model = tsa.SARIMAX(log_data, trend='c', order=(ar_order,0,0), 
                            seasonal_order=(season_order,0,0, 7)).fit()
    bic_results[(ar_order, season_order)] = model.bic
    aic_results[(ar_order, season_order)] = model.aic

I now compute the find the model with the smallest BIC.

sorted(bic_results, key=bic_results.get)[0]

(6, 4)

I now do the same for the AIC.

sorted(aic_results, key=aic_results.get)[0]

(10, 4)

model7 = tsa.SARIMAX(log_data, trend='c', order=(10,0,0), seasonal_order=(4,0,0, 7)).fit()

model7.summary()

fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
plot_acf(model7.resid, ax=ax10, nlags=30, alpha=0.25, color='grey')
ax10.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))
fig10.savefig("../doc/figures/ar9_plus_seasonality_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)

Use day of the week as a regressor¶

log_data['dayofweek'] = log_data.index.dayofweek

log_data['value'] = 1
pivoted_data = pd.pivot_table(log_data, index=[log_data.index, log_data['EPU']],
               columns=['dayofweek'], values='value', fill_value=0).reset_index().set_index('DATE')

model5 = tsa.SARIMAX(log_data.EPU, exog=pivoted_data.iloc[:,1:], trend='n', order=(5,0,0)).fit()

model5.summary()

fig9, ax9 = mpl.pyplot.subplots()
fig9.set_size_inches((12,6.75))
plot_acf(model5.resid, ax=ax9, nlags=30, alpha=0.25, color='grey')
ax9.xaxis.set_major_locator(mpl.ticker.MaxNLocator(steps=[7]))

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

	EPU
count	12275.000000
mean	101.070735
std	68.600759
min	3.320000
25%	53.935000
50%	83.710000
75%	129.765000
max	719.070000

	EPU
count	12275.000000
mean	4.405585
std	0.668167
min	1.199965
25%	3.987780
50%	4.427358
75%	4.865725
max	6.577959

Model:	ARMA	BIC:	21977.3574
Dependent Variable:	EPU	Log-Likelihood:	-10975.
Date:	2018-08-13 08:27	Scale:	1.0000
No. Observations:	12275	Method:	css
Df Model:	2	Sample:	01-02-1985
Df Residuals:	12272		08-10-2018
Converged:	1.0000	S.D. of innovations:	0.592
No. Iterations:	5.0000	HQIC:	21962.566
AIC:	21955.1116

	Coef.	Std.Err.	t	P>\|t\|	[0.025	0.975]
const	4.4056	0.0100	441.6571	0.0000	4.3860	4.4251
ar.L1.EPU	0.4646	0.0080	58.1283	0.0000	0.4489	0.4803

	Real	Imaginary	Modulus	Frequency
AR.1	2.1523	0.0000	2.1523	0.0000

Dep. Variable:	EPU	No. Observations:	12275
Model:	SARIMAX(2, 0, 0)	Log Likelihood	-10666.539
Date:	Mon, 13 Aug 2018	AIC	21341.079
Time:	08:27:42	BIC	21370.740
Sample:	01-01-1985	HQIC	21351.018
	- 08-10-2018
Covariance Type:	opg

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	1.8365	0.040	46.438	0.000	1.759	1.914
ar.L1	0.3617	0.008	44.132	0.000	0.346	0.378
ar.L2	0.2215	0.009	25.862	0.000	0.205	0.238
sigma2	0.3328	0.004	88.879	0.000	0.325	0.340

Ljung-Box (Q):	3776.90	Jarque-Bera (JB):	535.21
Prob(Q):	0.00	Prob(JB):	0.00
Heteroskedasticity (H):	0.51	Skew:	-0.36
Prob(H) (two-sided):	0.00	Kurtosis:	3.72

Ljung-Box (Q):	8317.56	Jarque-Bera (JB):	587.51
Prob(Q):	0.00	Prob(JB):	0.00
Heteroskedasticity (H):	0.65	Skew:	-0.35
Prob(H) (two-sided):	0.00	Kurtosis:	3.80

Ljung-Box (Q):	4341.70	Jarque-Bera (JB):	645.06
Prob(Q):	0.00	Prob(JB):	0.00
Heteroskedasticity (H):	0.56	Skew:	-0.40
Prob(H) (two-sided):	0.00	Kurtosis:	3.79

Ljung-Box (Q):	2453.05	Jarque-Bera (JB):	652.43
Prob(Q):	0.00	Prob(JB):	0.00
Heteroskedasticity (H):	0.53	Skew:	-0.39
Prob(H) (two-sided):	0.00	Kurtosis:	3.81

Ljung-Box (Q):	701.58	Jarque-Bera (JB):	812.95
Prob(Q):	0.00	Prob(JB):	0.00
Heteroskedasticity (H):	0.52	Skew:	-0.38
Prob(H) (two-sided):	0.00	Kurtosis:	4.00

Ljung-Box (Q):	362.57	Jarque-Bera (JB):	871.42
Prob(Q):	0.00	Prob(JB):	0.00
Heteroskedasticity (H):	0.52	Skew:	-0.34
Prob(H) (two-sided):	0.00	Kurtosis:	4.11

Ljung-Box (Q):	656.25	Jarque-Bera (JB):	812.29
Prob(Q):	0.00	Prob(JB):	0.00
Heteroskedasticity (H):	0.53	Skew:	-0.29
Prob(H) (two-sided):	0.00	Kurtosis:	4.12

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	2.4628	0.033	73.740	0.000	2.397	2.528
ar.S.L7	0.4410	0.008	57.934	0.000	0.426	0.456
sigma2	0.3597	0.004	89.927	0.000	0.352	0.368

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	2.0119	0.031	65.042	0.000	1.951	2.072
ar.L1	0.3472	0.008	43.269	0.000	0.332	0.363
ar.S.L7	0.3005	0.008	37.224	0.000	0.285	0.316
sigma2	0.3241	0.004	89.542	0.000	0.317	0.331

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	1.6994	0.034	49.490	0.000	1.632	1.767
ar.L1	0.2994	0.008	35.936	0.000	0.283	0.316
ar.L2	0.1806	0.009	20.851	0.000	0.164	0.198
ar.S.L7	0.2582	0.008	31.699	0.000	0.242	0.274
sigma2	0.3140	0.003	90.065	0.000	0.307	0.321

	coef	std err	z	P>\|z\|	[0.025	0.975]
intercept	1.1746	0.037	31.515	0.000	1.102	1.248
ar.L1	0.2482	0.008	29.704	0.000	0.232	0.265
ar.L2	0.1161	0.009	13.015	0.000	0.099	0.134
ar.L3	0.0852	0.009	9.814	0.000	0.068	0.102
ar.L4	0.0542	0.009	6.198	0.000	0.037	0.071
ar.L5	0.0854	0.009	9.901	0.000	0.068	0.102
ar.S.L7	0.1834	0.008	22.109	0.000	0.167	0.200
ar.S.L14	0.1678	0.008	20.339	0.000	0.152	0.184
sigma2	0.2957	0.003	92.922	0.000	0.289	0.302