import numpy as np
import pandas as pd
import matplotlib as mpl
import statsmodels.tsa.api as tsa
import statsmodels.api as sm
import pandas_datareader.data as web
import seaborn as sns
from scipy import stats
%matplotlib inline
mpl.style.use('seaborn-talk')
def plot_acf(data, ax, nlags=50, pct=.95, **kwargs):
    """
    Plot the autocorrelation function of the data along with the associated Bartlett bands.

    Under the null hypothesis that there is no dependence in the data, the Bartlett bands'
    width is constant.

    Parameters
    ----------
    data : pandas dataframe
    ax : matplotlib axes
    pct : float in [0, 1]
        The coverage probability of the Bartlett bands.
    nlags : positive int
        The number of lags to plot.
    kwargs : parameters to be passed to matplotlib's fill_between when drawing the bands.
    """
    # Plot the sample autocorrelations at lags 1 through nlags - 1.
    ax.scatter(np.arange(1, nlags), sm.tsa.acf(x=data, unbiased=True, nlags=nlags-1)[1:])
    # Under the null, each sample autocorrelation is approximately N(0, 1/T).
    ci_bottom, ci_top = stats.norm.interval(alpha=pct, scale=data.size**-.5, loc=0)
    ax.fill_between(x=np.arange(1, nlags), y1=ci_bottom, y2=ci_top, **kwargs)
    ax.axhline(0, color='black')
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))
    ax.set_ylim([-1, 1])
    ax.set_xlim([0, nlags-1])
I start by downloading the data. I get it from https://www.dhs.gov/immigration-statistics/yearbook/2016/table1, which contains the number of foreign nationals who were granted lawful permanent residence, admitted as temporary nonimmigrants, granted asylum or refugee status, or naturalized.
data_in = pd.concat(pd.read_html('https://www.dhs.gov/immigration-statistics/yearbook/2016/table1', header=0),
                    axis=0)
data = pd.DataFrame(data_in['Number'].values,
                    index=pd.DatetimeIndex([pd.to_datetime(str(val)[:4], format='%Y')
                                            for val in data_in['Year']], freq='-1AS'),
                    columns=['Number']).sort_index()
I now examine the data to ensure that it was imported correctly.
data.head()
data.tail()
I now plot the data to see how the immigration numbers change over time.
data.count()
overview_fig, overview_ax = mpl.pyplot.subplots()
overview_fig.set_size_inches((12,6.75))
overview_ax.plot(data)
overview_ax.set_ylabel('Number')
overview_ax.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
overview_fig.savefig("../doc/figures/immigration_overview.tmp.pdf", bbox_inches="tight", pad_inches=0,
                     frameon=False)
recession_dates = web.DataReader('USREC', 'fred', start='1800')
recession_dates.head()
Note that the FRED data on the dates of U.S. recessions, as determined by the National Bureau of Economic Research, start in 1854, not 1820 as the immigration data do.
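To confirm the overlap before plotting the two series together, a quick sketch (not part of the original analysis) prints each series' first observation:
# Quick check of where each series starts (illustrative only).
print(data.index.min())             # the immigration data start in 1820
print(recession_dates.index.min())  # the NBER recession indicator starts in 1854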
fig2, ax2 = mpl.pyplot.subplots()
fig2.set_size_inches((12,6.75))
ax2.plot(data['1918':])
ax2.fill_between(recession_dates['1918':].index,
                 np.asscalar(data.max()) * recession_dates['1918':].values.ravel(),
                 color='gray', alpha=0.5)
ax2.set_ylabel('Number')
ax2.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig2.savefig("../doc/figures/post_wwi_immigration_overview.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
Another graph that can be quite useful is a histogram.
fig3, ax3 = mpl.pyplot.subplots()
fig3.set_size_inches((12,6.75))
ax3.set_xlabel('Number')
ax3.set_ylabel('Count')
sns.distplot(data, kde=False, rug=True, ax=ax3)
fig3.savefig("../doc/figures/us_immigration_histogram.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
data.describe()
stats.describe(data)
These data are not in easily interpretable units, and they are always positive. For data that are always positive, it is often useful to take logarithms. We could also compute z-scores so that the data have zero sample mean and unit sample variance.
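As a sketch of the z-score alternative (not used further below):
# Standardize to zero sample mean and unit sample variance (illustrative only).
z_data = (data - data.mean()) / data.std()
z_data.describe()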
log_data = data.apply(np.log)
log_data.head()
fig4, ax4 = mpl.pyplot.subplots()
fig4.set_size_inches((12,6.75))
sns.distplot(log_data, kde=True, rug=True, ax=ax4, norm_hist=True, fit=stats.norm)
ax4.set_ylabel('Density')
fig4.savefig("../doc/figures/us_immigration_log_histogram.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
log_data.describe()
stats.describe(log_data)
fig5, ax5 = mpl.pyplot.subplots()
fig5.set_size_inches((12,6.75))
ax5.plot(log_data)
ax5.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig5.savefig("../doc/figures/us_immigration_log.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
fig6, ax6 = mpl.pyplot.subplots()
fig6.set_size_inches((12,6.75))
plot_acf(log_data, ax=ax6, nlags=20, alpha=0.5, color='grey')
fig6.savefig("../doc/figures/us_immigration_log_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
The data are clearly highly persistent. The obvious question is how persistent. If they are persistent enough, much of standard statistics is no longer applicable. For example, if the data follow a random walk, we can no longer even take means or variances:
$$ y_t = y_{t-1} + \eta_t$$
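For intuition, here is a small simulation sketch (illustrative only, not part of the original analysis): the sample mean of a random walk does not settle down as the sample grows.
rng = np.random.RandomState(0)
walk = rng.standard_normal(10000).cumsum()
# The running sample means keep drifting instead of converging.
print(walk[:1000].mean(), walk[:5000].mean(), walk.mean())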
We can actually estimate this model and see how well it fits. (Fitting an AR(1) by conditional sum of squares just runs OLS on the model above.)
model1 = tsa.ARIMA(log_data, order=(1,0,0)).fit(method='css', trend='c')
model1.params
model1.summary2()
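As a sanity check (a sketch, not part of the original analysis), we can verify the OLS equivalence directly:
# OLS of y_t on a constant and y_{t-1}. The slope should closely match the
# AR(1) coefficient above; note the ARIMA 'const' is the mean, not the OLS intercept.
y = log_data['Number']
ols_fit = sm.OLS(y.iloc[1:].values, sm.add_constant(y.shift(1).iloc[1:].values)).fit()
print(ols_fit.params)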
If we are in a world where the data might be a random walk, standard tests do not work: the sample mean and sample variance no longer converge to their population counterparts.
The augmented Dickey-Fuller test checks the null hypothesis that the process has a unit root, i.e., that it is a random walk. The p-value here is 0.08, so we cannot reject a random walk at the 5% level.
tsa.stattools.adfuller(log_data.values.ravel(), autolag='AIC', regression='c')
The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test works in the other direction: its null hypothesis is that the process is stationary (mean-reverting).
tsa.stattools.kpss(log_data.values.ravel())
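Both functions return tuples; a small sketch (illustrative only) extracts just the p-values quoted here:
adf_pvalue = tsa.stattools.adfuller(log_data.values.ravel(), autolag='AIC', regression='c')[1]
kpss_pvalue = tsa.stattools.kpss(log_data.values.ravel())[1]
print('ADF p-value: {:.3f}; KPSS p-value: {:.4f}'.format(adf_pvalue, kpss_pvalue))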
Here the p-value is 0.0265, so we can reject stationarity. Moving forward, we will therefore consider models of the following form, where $\eta_t$ is itself a stationary process:
$$ y_t = y_{t-1} + \eta_t $$
data_diff = log_data.diff().dropna()
fig6, ax6 = mpl.pyplot.subplots()
fig6.set_size_inches((12,6.75))
ax6.plot(data_diff)
ax6.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig6.savefig("../doc/figures/diffed_log_immigration.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
fig7, ax7 = mpl.pyplot.subplots()
fig7.set_size_inches((12,6.75))
sns.distplot(data_diff, kde=True, rug=True, ax=ax7, norm_hist=True, fit=stats.norm)
fig8, ax8 = mpl.pyplot.subplots()
fig8.set_size_inches((12,6.75))
plot_acf(data_diff, ax=ax8, nlags=20, alpha=0.5, color='grey')
fig8.savefig("../doc/figures/migrant_diff_log_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0,
             frameon=False)
model2 = tsa.ARIMA(data_diff, order=(1,0,0)).fit(method='css')
model2.summary2()
That coefficient is rather small. Interesting. It turns out that you cannot fit cyclic behavior with just one lag: an AR model needs at least two lags before its characteristic polynomial can have complex roots, which is what generates cycles. Let's try more lags and see what happens.
fig9, ax9 = mpl.pyplot.subplots()
fig9.set_size_inches((12,6.75))
plot_acf(model2.resid, ax=ax9, nlags=20, alpha=0.5, color='grey')
fig9.savefig("../doc/figures/ar1_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
model3 = tsa.ARIMA(data_diff, order=(2,0,0)).fit(method='css')
model3.summary2()
Look. The second lag is meaningful.
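As a quick check (a sketch, not in the original analysis), the fitted AR(2)'s characteristic roots tell us whether it can generate cycles; complex roots produce cyclic dynamics, real roots do not.
# arroots returns the roots of the fitted AR characteristic polynomial.
print(model3.arroots)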
There still seems to be a pattern. Let's try three lags, and then go crazy with five.
model4 = tsa.ARIMA(data_diff, order=(3,0,0)).fit(method='css')
model4.summary2()
model5 = tsa.ARIMA(data_diff, order=(5,0,0)).fit(method='css', trend='c')
model5.summary2()
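To compare the candidate lag lengths more systematically, here is a sketch (not part of the original analysis) printing each model's AIC; lower is better.
# Compare the fitted models by information criterion (illustrative only).
for name, mod in [('AR(1)', model2), ('AR(2)', model3), ('AR(3)', model4), ('AR(5)', model5)]:
    print(name, mod.aic)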
fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
plot_acf(model5.resid, ax=ax10, nlags=20, alpha=0.5, color='grey')
fig10.savefig("../doc/figures/ar5_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
sns.distplot(model5.resid, kde=True, rug=True, ax=ax10)
fig11, ax11 = mpl.pyplot.subplots()
fig11.set_size_inches((12,6.75))
ax11.plot(data_diff)
ax11.plot(model5.fittedvalues)
model6 = tsa.ARIMA(log_data, order=(5,1,0)).fit(method='css', trend='c')
model6.summary2()
model6.fittedvalues.head()
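Because the model was fit with d=1, its fitted values are in differences. Assuming this statsmodels version supports the typ argument, predict(typ='levels') would return fitted values in levels directly; below I reconstruct them by hand instead.
# Alternative sketch (assumption: this ARIMAResults version accepts typ='levels').
levels_fit = model6.predict(typ='levels')
levels_fit.head()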
fig11, ax11 = mpl.pyplot.subplots()
fig11.set_size_inches((12,6.75))
# The fitted values are fitted differences, so add them to the previous
# period's log level to get fitted log levels.
fittedvalues = (log_data.shift(1).T + model6.fittedvalues).T.dropna()
ax11.plot(data, label='Data')
ax11.plot(np.exp(fittedvalues), label='Fitted Values')
ax11.legend()
ax11.set_ylim([-500000, 2000000])
fig11.savefig("../doc/figures/ar5_fitted_values.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
fig12, ax12 = mpl.pyplot.subplots()
fig12.set_size_inches((12,6.75))
# As above: reconstruct fitted log levels from the fitted differences.
fittedvalues = (log_data.shift(1).T + model6.fittedvalues).T.dropna()
ax12.plot(data - np.exp(fittedvalues), label='Residuals')
ax12.set_ylim([-500000, 500000])
fig12.savefig("../doc/figures/ar5_residuals.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
Copyright 2018 Paul Sangrey
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.