import numpy as np
import pandas as pd
import matplotlib as mpl
import statsmodels.tsa.api as tsa
import statsmodels.api as sm
import pandas_datareader.data as web
import seaborn as sns
from scipy import stats
%matplotlib inline
mpl.style.use('seaborn-talk')
def plot_acf(data, ax, nlags=50, pct=.95, **kwargs):
    """
    Plot the autocorrelation function of the data along with the associated Bartlett bands.

    Under the null hypothesis that there is no dependence in the data, the Bartlett bands'
    width is constant.

    Parameters
    ----------
    data : pandas dataframe
    ax : matplotlib axes
    pct : float in [0, 1]
        The coverage probability of the Bartlett bands.
    nlags : positive int
        The number of lags to plot.
    kwargs : parameters to be passed to matplotlib's fill_between when drawing the bands.
    """
    # Plot the sample autocorrelations at lags 1 through nlags - 1.
    ax.scatter(np.arange(1, nlags), sm.tsa.acf(x=data, unbiased=True, nlags=nlags-1)[1:])
    # Under the null, each sample autocorrelation is approximately N(0, 1/T).
    ci_bottom, ci_top = stats.norm.interval(alpha=pct, scale=data.size**-.5, loc=0)
    ax.fill_between(x=np.arange(1, nlags), y1=ci_bottom, y2=ci_top, **kwargs)
    ax.axhline(0, color='black')
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))
    ax.set_ylim([-1, 1])
    ax.set_xlim([0, nlags-1])
I start by downloading the data. I get it from https://www.dhs.gov/immigration-statistics/yearbook/2016/table1, which contains the number of foreign nationals who were granted lawful permanent residence, admitted as temporary nonimmigrants, granted asylum or refugee status, or naturalized.
data_in = pd.concat(pd.read_html('https://www.dhs.gov/immigration-statistics/yearbook/2016/table1', header=0),
                    axis=0)
data = pd.DataFrame(data_in['Number'].values,
                    index=pd.DatetimeIndex([pd.to_datetime(str(val)[:4], format='%Y')
                                            for val in data_in['Year']], freq='-1AS'),
                    columns=['Number']).sort_index()
I now examine the data to ensure that it was imported correctly.
data.head()
data.tail()
I now plot the data to see how the immigration numbers change over time.
data.count()
overview_fig, overview_ax = mpl.pyplot.subplots()
overview_fig.set_size_inches((12,6.75))
overview_ax.plot(data)
overview_ax.set_ylabel('Number')
overview_ax.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
overview_fig.savefig("../doc/figures/immigration_overview.tmp.pdf", bbox_inches="tight", pad_inches=0,
                     frameon=False)
recession_dates = web.DataReader('USREC', 'fred', start='1800')
recession_dates.head()
Note that the FRED data on the dates of U.S. recessions, as determined by the National Bureau of Economic Research, start in 1854, not 1820 as the immigration data do.
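To confirm the overlap before plotting the two series together, a quick sketch (not part of the original analysis) prints each series' first observation:
# Quick check of where each series starts (illustrative only).
print(data.index.min())             # the immigration data start in 1820
print(recession_dates.index.min())  # the NBER recession indicator starts in 1854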
fig2, ax2 = mpl.pyplot.subplots()
fig2.set_size_inches((12,6.75))
ax2.plot(data['1918':])
ax2.fill_between(recession_dates['1918':].index,
                 np.asscalar(data.max()) * recession_dates['1918':].values.ravel(),
                 color='gray', alpha=0.5)
ax2.set_ylabel('Number')
ax2.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig2.savefig("../doc/figures/post_wwi_immigration_overview.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
Another graph that can be quite useful is a histogram.
fig3, ax3 = mpl.pyplot.subplots()
fig3.set_size_inches((12,6.75))
ax3.set_xlabel('Number')
ax3.set_ylabel('Count')
sns.distplot(data, kde=False, rug=True, ax=ax3)
fig3.savefig("../doc/figures/us_immigration_histogram.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
data.describe()
stats.describe(data)
These data are not in easily interpretable units, and they are always positive. For data that are always positive, it is often useful to take logarithms. We could also compute z-scores so that the data have zero sample mean and unit sample variance.
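As a sketch of the z-score alternative (not used further below):
# Standardize to zero sample mean and unit sample variance (illustrative only).
z_data = (data - data.mean()) / data.std()
z_data.describe()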
log_data = data.apply(np.log)
log_data.head()
fig4, ax4 = mpl.pyplot.subplots()
fig4.set_size_inches((12,6.75))
sns.distplot(log_data, kde=True, rug=True, ax=ax4, norm_hist=True, fit=stats.norm)
ax4.set_ylabel('Density')
fig4.savefig("../doc/figures/us_immigration_log_histogram.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
log_data.describe()
stats.describe(log_data)
fig5, ax5 = mpl.pyplot.subplots()
fig5.set_size_inches((12,6.75))
ax5.plot(log_data)
ax5.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig5.savefig("../doc/figures/us_immigration_log.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
fig6, ax6 = mpl.pyplot.subplots()
fig6.set_size_inches((12,6.75))
plot_acf(log_data, ax=ax6, nlags=20, alpha=0.5, color='grey')
fig6.savefig("../doc/figures/us_immigration_log_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
The data are clearly highly persistent. The obvious question is how persistent. If they are persistent enough, much of standard statistics is no longer applicable. For example, if the data follow a random walk, we can no longer even take means or variances:
$$ y_t = y_{t-1} + \eta_t$$
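For intuition, here is a small simulation sketch (illustrative only, not part of the original analysis): the sample mean of a random walk does not settle down as the sample grows.
rng = np.random.RandomState(0)
walk = rng.standard_normal(10000).cumsum()
# The running sample means keep drifting instead of converging.
print(walk[:1000].mean(), walk[:5000].mean(), walk.mean())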
We can actually estimate this model and see how well it fits. (Fitting an AR(1) by conditional sum of squares just runs OLS on the model above.)
model1 = tsa.ARIMA(log_data, order=(1,0,0)).fit(method='css', trend='c')
model1.params
model1.summary2()
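As a sanity check (a sketch, not part of the original analysis), we can verify the OLS equivalence directly:
# OLS of y_t on a constant and y_{t-1}. The slope should closely match the
# AR(1) coefficient above; note the ARIMA 'const' is the mean, not the OLS intercept.
y = log_data['Number']
ols_fit = sm.OLS(y.iloc[1:].values, sm.add_constant(y.shift(1).iloc[1:].values)).fit()
print(ols_fit.params)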
If we are in a world where the data might be a random walk, standard tests do not work: the sample mean and sample variance no longer converge to their population counterparts.
The augmented Dickey-Fuller test checks the null hypothesis that the process has a unit root, i.e., that it is a random walk. The p-value here is 0.08, so we cannot reject a random walk at the 5% level.
tsa.stattools.adfuller(log_data.values.ravel(), autolag='AIC', regression='c')
The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test works in the other direction: its null hypothesis is that the process is stationary (mean-reverting).
tsa.stattools.kpss(log_data.values.ravel())
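Both functions return tuples; a small sketch (illustrative only) extracts just the p-values quoted here:
adf_pvalue = tsa.stattools.adfuller(log_data.values.ravel(), autolag='AIC', regression='c')[1]
kpss_pvalue = tsa.stattools.kpss(log_data.values.ravel())[1]
print('ADF p-value: {:.3f}; KPSS p-value: {:.4f}'.format(adf_pvalue, kpss_pvalue))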
Here the p-value is 0.0265, so we can reject stationarity. Moving forward, we will therefore consider models of the following form, where $\eta_t$ is itself a stationary process:
$$ y_t = y_{t-1} + \eta_t $$
data_diff = log_data.diff().dropna()
fig6, ax6 = mpl.pyplot.subplots()
fig6.set_size_inches((12,6.75))
ax6.plot(data_diff)
ax6.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig6.savefig("../doc/figures/diffed_log_immigration.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
fig7, ax7 = mpl.pyplot.subplots()
fig7.set_size_inches((12,6.75))
sns.distplot(data_diff, kde=True, rug=True, ax=ax7, norm_hist=True, fit=stats.norm)
fig8, ax8 = mpl.pyplot.subplots()
fig8.set_size_inches((12,6.75))
plot_acf(data_diff, ax=ax8, nlags=20, alpha=0.5, color='grey')
fig8.savefig("../doc/figures/migrant_diff_log_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
model2 = tsa.ARIMA(data_diff, order=(1,0,0)).fit(method='css')
model2.summary2()
That coefficient is rather small. Interesting. It turns out that you cannot fit cyclic behavior with just one lag: an AR model needs at least two lags before its characteristic polynomial can have complex roots, which is what generates cycles. Let's try more lags and see what happens.
fig9, ax9 = mpl.pyplot.subplots()
fig9.set_size_inches((12,6.75))
plot_acf(model2.resid, ax=ax9, nlags=20, alpha=0.5, color='grey')
fig9.savefig("../doc/figures/ar1_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
model3 = tsa.ARIMA(data_diff, order=(2,0,0)).fit(method='css')
model3.summary2()
Look. The second lag is meaningful.
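As a quick check (a sketch, not in the original analysis), the fitted AR(2)'s characteristic roots tell us whether it can generate cycles; complex roots produce cyclic dynamics, real roots do not.
# arroots returns the roots of the fitted AR characteristic polynomial.
print(model3.arroots)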
There still seems to be a pattern. Let's try three lags, and then go crazy with five.
model4 = tsa.ARIMA(data_diff, order=(3,0,0)).fit(method='css')
model4.summary2()
model5 = tsa.ARIMA(data_diff, order=(5,0,0)).fit(method='css', trend='c')
model5.summary2()
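To compare the candidate lag lengths more systematically, here is a sketch (not part of the original analysis) printing each model's AIC; lower is better.
# Compare the fitted models by information criterion (illustrative only).
for name, mod in [('AR(1)', model2), ('AR(2)', model3), ('AR(3)', model4), ('AR(5)', model5)]:
    print(name, mod.aic)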
fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
plot_acf(model5.resid, ax=ax10, nlags=20, alpha=0.5, color='grey')
fig10.savefig("../doc/figures/ar5_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
sns.distplot(model5.resid, kde=True, rug=True, ax=ax10)
fig11, ax11 = mpl.pyplot.subplots()
fig11.set_size_inches((12,6.75))
ax11.plot(data_diff)
ax11.plot(model5.fittedvalues)
model6 = tsa.ARIMA(log_data, order=(5,1,0)).fit(method='css', trend='c')
model6.summary2()
model6.fittedvalues.head()
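Because the model was fit with d=1, its fitted values are in differences. Assuming this statsmodels version supports the typ argument, predict(typ='levels') would return fitted values in levels directly; below I reconstruct them by hand instead.
# Alternative sketch (assumption: this ARIMAResults version accepts typ='levels').
levels_fit = model6.predict(typ='levels')
levels_fit.head()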
fig11, ax11 = mpl.pyplot.subplots()
fig11.set_size_inches((12,6.75))
# The fitted values are fitted differences, so add them to the previous
# period's log level to get fitted log levels.
fittedvalues = (log_data.shift(1).T + model6.fittedvalues).T.dropna()
ax11.plot(data, label='Data')
ax11.plot(np.exp(fittedvalues), label='Fitted Values')
ax11.legend()
ax11.set_ylim([-500000, 2000000])
fig11.savefig("../doc/figures/ar5_fitted_values.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
fig12, ax12 = mpl.pyplot.subplots()
fig12.set_size_inches((12,6.75))
# As above: reconstruct fitted log levels from the fitted differences.
fittedvalues = (log_data.shift(1).T + model6.fittedvalues).T.dropna()
ax12.plot(data - np.exp(fittedvalues), label='Residuals')
ax12.set_ylim([-500000, 500000])
fig12.savefig("../doc/figures/ar5_residuals.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
Copyright 2018 Paul Sangrey
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.