In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import statsmodels.tsa.api as tsa
import statsmodels.api as sm
import pandas_datareader.data as web
import seaborn as sns
from scipy import stats
In [2]:
%matplotlib inline
mpl.style.use('seaborn-talk')

I define a function to plot autocorrelation functions along with their Bartlett bands.

In [3]:
def plot_acf(data, ax, nlags=50, pct=.95, **kwargs):
    """
    This function plots the autocorrelation function of the data and the associated Bartlett bands.

    Under the null of no dependence in the data, the Bartlett bands have constant width.

    Parameters
    ----------
    data : pandas dataframe
    ax : matplotlib axes
    pct : float in [0, 1]
        The coverage level of the Bartlett bands.
    nlags : positive int
        The number of lags to plot.
    kwargs : dict
        Parameters passed to the fill_between call that draws the bands.
    """

    ax.scatter(np.arange(1, nlags),sm.tsa.acf(x=data, unbiased=True, nlags=nlags-1)[1:])
    pit_ci_bottom, pit_ci_top = stats.norm.interval(alpha=pct, scale=data.size**-.5, loc=0) 

    ax.fill_between(x=np.arange(1,nlags), y1=pit_ci_bottom, y2=pit_ci_top, **kwargs)
    ax.axhline(0, color='black')
    ax.xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))

    ax.set_ylim([-1,1])
    ax.set_xlim([0, nlags-1])
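
For reference, the bands follow from Bartlett's result that, under the null of serially independent data, each sample autocorrelation is approximately normal,

$$ \hat{\rho}_k \overset{a}{\sim} N\!\left(0, \tfrac{1}{T}\right), $$

so a band with coverage pct is $\pm z_{(1+\mathrm{pct})/2}/\sqrt{T}$, which is exactly what the stats.norm.interval call above computes with scale equal to data.size**-.5.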

Downloading Data

I start by downloading the data. I get the data from https://www.dhs.gov/immigration-statistics/yearbook/2016/table1, which contains the number of foreign nationals who are granted lawful permanent residence, admitted as temporary nonimmigrants, granted asylum or refugee status, or are naturalized.

In [4]:
data_in = pd.concat(pd.read_html('https://www.dhs.gov/immigration-statistics/yearbook/2016/table1', header=0),
                 axis=0)
data = pd.DataFrame(data_in['Number'].values, index=pd.DatetimeIndex([pd.to_datetime(str(val)[:4], format='%Y') 
                                              for val in data_in['Year']], freq='-1AS'), columns=['Number']).sort_index()

I now examine the data to ensure that they were imported correctly.

In [5]:
data.head()
Out[5]:
Number
1820-01-01 8385
1821-01-01 9127
1822-01-01 6911
1823-01-01 6354
1824-01-01 7912
In [6]:
data.tail()
Out[6]:
Number
2012-01-01 1031631
2013-01-01 990553
2014-01-01 1016518
2015-01-01 1051031
2016-01-01 1183505

I now plot the data to see how immigration changes over time.

In [7]:
data.count()
Out[7]:
Number    197
dtype: int64
In [8]:
overview_fig, overview_ax = mpl.pyplot.subplots()
overview_fig.set_size_inches((12,6.75))
overview_ax.plot(data)
overview_ax.set_ylabel('Number')
overview_ax.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
overview_fig.savefig("../doc/figures/immigration_overview.tmp.pdf", bbox_inches="tight", pad_inches=0, 
                     frameon=False)
In [9]:
recession_dates = web.DataReader('USREC', 'fred', start='1800')
In [10]:
recession_dates.head()
Out[10]:
USREC
DATE
1854-12-01 1
1855-01-01 0
1855-02-01 0
1855-03-01 0
1855-04-01 0

Note that the data from FRED on the dates of U.S. recessions, as determined by the National Bureau of Economic Research, start in 1854, not in 1820 like the immigration data.

In [11]:
fig2, ax2 = mpl.pyplot.subplots()
fig2.set_size_inches((12,6.75))
ax2.plot(data['1918':])
ax2.fill_between(recession_dates['1918':].index, np.asscalar(data.max()) * recession_dates['1918':].values.ravel(), color='gray',
                 alpha=0.5)
ax2.set_ylabel('Number')
ax2.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig2.savefig("../doc/figures/post_wwi_immigration_overview.tmp.pdf", bbox_inches="tight", pad_inches=0, 
                     frameon=False)

Another graph that can be quite useful is a histogram.

In [12]:
fig3, ax3 = mpl.pyplot.subplots()
fig3.set_size_inches((12,6.75))
ax3.set_xlabel('Number')
ax3.set_ylabel('Count')
sns.distplot(data, kde=False, rug=True, ax=ax3)
fig3.savefig("../doc/figures/us_immigration_histogram.tmp.pdf", bbox_inches="tight", pad_inches=0, 
                     frameon=False)
In [13]:
data.describe()
Out[13]:
Number
count 1.970000e+02
mean 4.199727e+05
std 3.578186e+05
min 6.354000e+03
25% 1.411320e+05
50% 3.213500e+05
75% 5.950140e+05
max 1.826595e+06
In [14]:
stats.describe(data)
Out[14]:
DescribeResult(nobs=197, minmax=(array([6354]), array([1826595])), mean=array([ 419972.71573604]), variance=array([  1.28034141e+11]), skewness=array([ 1.0921735]), kurtosis=array([ 0.75893778]))

These data are not in easily interpretable units, and they are strictly positive. For data that are always positive, it is often useful to take logarithms. We could also compute z-scores to give the series zero sample mean and unit sample variance.
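
As a quick aside, the z-score transformation is a one-liner in pandas; a minimal sketch, not used in the rest of the analysis:

# Standardize the series: subtract the sample mean and divide by the sample
# standard deviation, giving zero mean and unit variance.
z_scores = (data - data.mean()) / data.std()
z_scores.describe()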

In [15]:
log_data = data.apply(np.log)
In [16]:
log_data.head()
Out[16]:
Number
1820-01-01 9.034200
1821-01-01 9.118992
1822-01-01 8.840870
1823-01-01 8.756840
1824-01-01 8.976136
In [17]:
fig4, ax4 = mpl.pyplot.subplots()
fig4.set_size_inches((12,6.75))
sns.distplot(log_data, kde=True, rug=True, ax=ax4, norm_hist=True, fit=stats.norm)
ax4.set_ylabel('Density')
fig4.savefig("../doc/figures/us_immigration_log_histogram.tmp.pdf", bbox_inches="tight", pad_inches=0, 
                     frameon=False)
In [18]:
log_data.describe()
Out[18]:
Number
count 197.000000
mean 12.432715
std 1.225621
min 8.756840
25% 11.857451
50% 12.680286
75% 13.296340
max 14.417964
In [19]:
stats.describe(log_data)
Out[19]:
DescribeResult(nobs=197, minmax=(array([ 8.75683981]), array([ 14.41796414])), mean=array([ 12.4327154]), variance=array([ 1.50214659]), skewness=array([-0.99395649]), kurtosis=array([ 0.53347377]))
In [20]:
fig5, ax5 = mpl.pyplot.subplots()
fig5.set_size_inches((12,6.75))
ax5.plot(log_data)
ax5.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig5.savefig("../doc/figures/us_immigration_log.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
In [21]:
fig6, ax6 = mpl.pyplot.subplots()
fig6.set_size_inches((12,6.75))
plot_acf(log_data, ax=ax6, nlags=20, alpha=0.5, color='grey')
fig6.savefig("../doc/figures/us_immigration_log_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, 
                     frameon=False)

The data are clearly highly persistent. The obvious question is how persistent. If they are persistent enough, essentially all of standard statistics is no longer applicable. For example, if they follow a random walk, their variance grows without bound over time, so we can no longer even take meaningful means or variances:

$$ y_t = y_{t-1} + \eta_t$$

We can actually estimate this model and see how well it fits. (An AR(1), fit here via tsa.ARIMA with order (1, 0, 0), amounts to OLS on the model above plus a constant.)
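
As a quick check on that parenthetical, one could run the OLS regression of $y_t$ on a constant and $y_{t-1}$ directly and compare the slope to the AR(1) coefficient reported below. A rough sketch; note that the ARMA "const" is parameterized as the process mean rather than the regression intercept, so only the slopes should line up:

# OLS of y_t on a constant and y_{t-1}; the slope should be close to the
# ar.L1 coefficient from the conditional-sum-of-squares ARMA fit below.
y = log_data['Number']
ols_fit = sm.OLS(y.values[1:], sm.add_constant(y.values[:-1])).fit()
print(ols_fit.params)  # [intercept, slope on the lagged value]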

In [22]:
model1 = tsa.ARIMA(log_data, order=(1,0,0)).fit(method='css', trend='c')
In [23]:
model1.params
Out[23]:
const           12.862607
ar.L1.Number     0.942317
dtype: float64
In [24]:
model1.summary2()
Out[24]:
Model: ARMA BIC: 156.5325
Dependent Variable: Number Log-Likelihood: -70.349
Date: 2018-08-12 14:54 Scale: 1.0000
No. Observations: 197 Method: css
Df Model: 2 Sample: 01-01-1821
Df Residuals: 194 01-01-2016
Converged: 1.0000 S.D. of innovations: 0.346
No. Iterations: 7.0000 HQIC: 150.680
AIC: 146.6982
Coef. Std.Err. t P>|t| [0.025 0.975]
const 12.8626 0.4558 28.2217 0.0000 11.9693 13.7559
ar.L1.Number 0.9423 0.0203 46.4781 0.0000 0.9026 0.9821
Real Imaginary Modulus Frequency
AR.1 1.0612 0.0000 1.0612 0.0000

If we are in a world where the data might be a random walk, standard tests do not work: the sample mean and sample variance no longer converge to their population counterparts.

The augmented Dickey-Fuller (ADF) test takes a unit root, i.e., a random walk, as its null hypothesis. The p-value here is 0.08, so we cannot reject a random walk at the 5% level.
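
Concretely, with the constant-only ('c') regression used here, the test estimates

$$ \Delta y_t = \alpha + \gamma\, y_{t-1} + \sum_{i=1}^{p} \delta_i\, \Delta y_{t-i} + \epsilon_t $$

and tests $H_0: \gamma = 0$ (a unit root) against $H_1: \gamma < 0$, with the lag length $p$ chosen by AIC in the call below.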

In [25]:
tsa.stattools.adfuller(log_data.values.ravel(), autolag='AIC', regression='c')
Out[25]:
(-2.6650017819615757,
 0.080292741922777522,
 7,
 189,
 {'1%': -3.4654311561944873,
  '5%': -2.8769570530458792,
  '10%': -2.574988319755886},
 122.86134001451518)

The Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test reverses the roles: its null hypothesis is that the process is stationary (mean-reverting).
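
Concretely, the level-stationarity version of the test decomposes the series as

$$ y_t = r_t + \epsilon_t, \qquad r_t = r_{t-1} + u_t, $$

where $\epsilon_t$ is stationary and $r_0$ plays the role of an intercept, and tests $H_0: \sigma_u^2 = 0$ (no random-walk component) against $H_1: \sigma_u^2 > 0$.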

In [26]:
tsa.stattools.kpss(log_data.values.ravel())
Out[26]:
(0.56701512440823498,
 0.02657317017832545,
 15,
 {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739})

Here the p-value is roughly 0.027, so we reject stationarity as well. Moving forward, we therefore consider models of the following form, where the increment $\eta_t$ is itself modeled as a stationary process:

$$ y_t = y_{t-1} + \eta_t$$

In [27]:
data_diff = log_data.diff().dropna()
In [28]:
fig6, ax6 = mpl.pyplot.subplots()
fig6.set_size_inches((12,6.75))
ax6.plot(data_diff)
ax6.xaxis.set_major_locator(mpl.dates.AutoDateLocator(interval_multiples=True))
fig6.savefig("../doc/figures/diffed_log_immigration.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
In [29]:
fig7, ax7 = mpl.pyplot.subplots()
fig7.set_size_inches((12,6.75))
sns.distplot(data_diff, kde=True, rug=True, ax=ax7, norm_hist=True, fit=stats.norm)
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe66045f518>
In [30]:
fig8, ax8 = mpl.pyplot.subplots()
fig8.set_size_inches((12,6.75))
plot_acf(data_diff, ax=ax8, nlags=20, alpha=0.5, color='grey')
fig8.savefig("../doc/figures/migrant_diff_log_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, 
                     frameon=False)
In [31]:
model2 = tsa.ARIMA(data_diff, order=(1,0,0)).fit(method='css')
In [32]:
model2.summary2()
Out[32]:
Model: ARMA BIC: 163.9452
Dependent Variable: Number Log-Likelihood: -74.063
Date: 2018-08-12 14:54 Scale: 1.0000
No. Observations: 196 Method: css
Df Model: 2 Sample: 01-01-1822
Df Residuals: 193 01-01-2016
Converged: 1.0000 S.D. of innovations: 0.354
No. Iterations: 6.0000 HQIC: 158.102
AIC: 154.1262
Coef. Std.Err. t P>|t| [0.025 0.975]
const 0.0250 0.0270 0.9256 0.3558 -0.0279 0.0778
ar.L1.Number 0.0605 0.0715 0.8462 0.3985 -0.0796 0.2006
Real Imaginary Modulus Frequency
AR.1 16.5302 0.0000 16.5302 0.0000

That coefficient is rather small. Interesting. It turns out that you cannot fit cyclic behavior with just one lag. Let's try more lags and see what happens.
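
To see why, recall that the autocorrelation pattern of an AR(p) process is governed by the roots of its characteristic polynomial,

$$ 1 - \phi_1 z - \phi_2 z^2 - \dots - \phi_p z^p = 0. $$

With a single lag the one root is real, so the autocorrelations decay monotonically in absolute value. Oscillatory, cycle-like autocorrelations require a complex pair of roots, which first becomes possible with two lags, when $\phi_1^2 + 4\phi_2 < 0$.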

In [33]:
fig9, ax9 = mpl.pyplot.subplots()
fig9.set_size_inches((12,6.75))

plot_acf(model2.resid, ax=ax9, nlags=20, alpha=0.5, color='grey')
fig9.savefig("../doc/figures/ar1_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
In [34]:
model3 = tsa.ARIMA(data_diff, order=(2,0,0)).fit(method='css')
In [35]:
model3.summary2()
Out[35]:
Model: ARMA BIC: 164.3034
Dependent Variable: Number Log-Likelihood: -71.616
Date: 2018-08-12 14:54 Scale: 1.0000
No. Observations: 196 Method: css
Df Model: 3 Sample: 01-01-1823
Df Residuals: 191 01-01-2016
Converged: 1.0000 S.D. of innovations: 0.350
No. Iterations: 8.0000 HQIC: 156.525
AIC: 151.2320
Coef. Std.Err. t P>|t| [0.025 0.975]
const 0.0264 0.0233 1.1333 0.2585 -0.0193 0.0721
ar.L1.Number 0.0703 0.0709 0.9916 0.3227 -0.0686 0.2092
ar.L2.Number -0.1491 0.0709 -2.1044 0.0366 -0.2880 -0.0102
Real Imaginary Modulus Frequency
AR.1 0.2356 -2.5789 2.5896 -0.2355
AR.2 0.2356 2.5789 2.5896 0.2355

Look: the second lag is meaningful.

There still seems to be a pattern in the residuals, though. Let's add more lags (first three, then five) and see what happens.
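
Rather than picking the order purely by hand, one could also compare a few lag lengths by information criterion. A rough sketch using the same conditional-sum-of-squares fitting as above; some orders may fail to converge, so this is illustrative only:

# Fit AR(p) models on the differenced series for several lag orders and
# print the AIC of each; smaller values indicate a better fit after
# penalizing for the number of parameters.
for p in range(1, 6):
    candidate = tsa.ARIMA(data_diff, order=(p, 0, 0)).fit(method='css')
    print(p, candidate.aic)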

In [36]:
model4 = tsa.ARIMA(data_diff, order=(3,0,0)).fit(method='css')
In [37]:
model4.summary2()
Out[37]:
Model: ARMA BIC: 166.3304
Dependent Variable: Number Log-Likelihood: -70.008
Date: 2018-08-12 14:54 Scale: 1.0000
No. Observations: 196 Method: css
Df Model: 4 Sample: 01-01-1824
Df Residuals: 189 01-01-2016
Converged: 1.0000 S.D. of innovations: 0.348
No. Iterations: 13.0000 HQIC: 156.623
AIC: 150.0169
Coef. Std.Err. t P>|t| [0.025 0.975]
const 0.0271 0.0267 1.0128 0.3125 -0.0253 0.0794
ar.L1.Number 0.0891 0.0713 1.2482 0.2135 -0.0508 0.2289
ar.L2.Number -0.1581 0.0706 -2.2393 0.0263 -0.2964 -0.0197
ar.L3.Number 0.1323 0.0712 1.8579 0.0647 -0.0073 0.2719
Real Imaginary Modulus Frequency
AR.1 -0.5600 -1.7181 1.8071 -0.3001
AR.2 -0.5600 1.7181 1.8071 0.3001
AR.3 2.3148 -0.0000 2.3148 -0.0000
In [38]:
model5 = tsa.ARIMA(data_diff, order=(5,0,0)).fit(method='css', trend='c')
In [39]:
model5.summary2()
Out[39]:
Model: ARMA BIC: 165.3972
Dependent Variable: Number Log-Likelihood: -64.316
Date: 2018-08-12 14:54 Scale: 1.0000
No. Observations: 196 Method: css
Df Model: 6 Sample: 01-01-1826
Df Residuals: 185 01-01-2016
Converged: 1.0000 S.D. of innovations: 0.339
No. Iterations: 21.0000 HQIC: 151.853
AIC: 142.6313
Coef. Std.Err. t P>|t| [0.025 0.975]
const 0.0250 0.0198 1.2616 0.2087 -0.0138 0.0639
ar.L1.Number 0.0806 0.0704 1.1443 0.2540 -0.0574 0.2186
ar.L2.Number -0.1365 0.0705 -1.9358 0.0544 -0.2747 0.0017
ar.L3.Number 0.1031 0.0708 1.4567 0.1469 -0.0356 0.2419
ar.L4.Number -0.0584 0.0704 -0.8305 0.4073 -0.1963 0.0795
ar.L5.Number -0.2254 0.0702 -3.2093 0.0016 -0.3631 -0.0877
Real Imaginary Modulus Frequency
AR.1 1.0641 -0.7874 1.3237 -0.1014
AR.2 1.0641 0.7874 1.3237 0.1014
AR.3 -0.4017 -1.1987 1.2642 -0.3015
AR.4 -0.4017 1.1987 1.2642 0.3015
AR.5 -1.5840 -0.0000 1.5840 -0.5000
In [40]:
fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
plot_acf(model5.resid, ax=ax10, nlags=20, alpha=0.5, color='grey')
fig10.savefig("../doc/figures/ar5_acorr.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
In [41]:
fig10, ax10 = mpl.pyplot.subplots()
fig10.set_size_inches((12,6.75))
sns.distplot(model4.resid, kde=True, rug=True, ax=ax10)
Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe65e971588>
In [42]:
fig11, ax11 = mpl.pyplot.subplots()
fig11.set_size_inches((12,6.75))
ax11.plot(data_diff)
ax11.plot(model5.fittedvalues)
Out[42]:
[<matplotlib.lines.Line2D at 0x7fe65e88a898>]
In [43]:
model6 = tsa.ARIMA(log_data, order=(5,1,0)).fit(method='css', trend='c')
In [44]:
model6.summary2()
Out[44]:
Model: ARIMA BIC: 165.3972
Dependent Variable: D.Number Log-Likelihood: -64.316
Date: 2018-08-12 14:54 Scale: 1.0000
No. Observations: 196 Method: css
Df Model: 6 Sample: 01-01-1826
Df Residuals: 185 01-01-2016
Converged: 1.0000 S.D. of innovations: 0.339
No. Iterations: 21.0000 HQIC: 151.853
AIC: 142.6313
Coef. Std.Err. t P>|t| [0.025 0.975]
const 0.0250 0.0198 1.2616 0.2087 -0.0138 0.0639
ar.L1.D.Number 0.0806 0.0704 1.1443 0.2540 -0.0574 0.2186
ar.L2.D.Number -0.1365 0.0705 -1.9358 0.0544 -0.2747 0.0017
ar.L3.D.Number 0.1031 0.0708 1.4567 0.1469 -0.0356 0.2419
ar.L4.D.Number -0.0584 0.0704 -0.8305 0.4073 -0.1963 0.0795
ar.L5.D.Number -0.2254 0.0702 -3.2093 0.0016 -0.3631 -0.0877
Real Imaginary Modulus Frequency
AR.1 1.0641 -0.7874 1.3237 -0.1014
AR.2 1.0641 0.7874 1.3237 0.1014
AR.3 -0.4017 -1.1987 1.2642 -0.3015
AR.4 -0.4017 1.1987 1.2642 0.3015
AR.5 -1.5840 -0.0000 1.5840 -0.5000
In [45]:
 model6.fittedvalues.head()
Out[45]:
1826-01-01    0.009934
1827-01-01    0.091388
1828-01-01    0.099674
1829-01-01   -0.072830
1830-01-01   -0.039143
Freq: AS-JAN, dtype: float64
In [46]:
fig11, ax11 = mpl.pyplot.subplots()
fig11.set_size_inches((12,6.75))
fittedvalues = (log_data.T + model6.fittedvalues).T.dropna()
ax11.plot(data, label='Data')
ax11.plot(np.exp(fittedvalues), label='Fitted Values')
ax11.legend()
ax11.set_ylim([-500000, 2000000])
fig11.savefig("../doc/figures/ar5_fitted_values.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)
In [48]:
fig12, ax12 = mpl.pyplot.subplots()
fig12.set_size_inches((12,6.75))
fittedvalues = (log_data.T + model6.fittedvalues).T.dropna()
ax12.plot(data - np.exp(fittedvalues), label='Residuals')
ax12.set_ylim([-500000, 500000])
fig12.savefig("../doc/figures/ar5_residuals.tmp.pdf", bbox_inches="tight", pad_inches=0, frameon=False)

Copyright 2018 Paul Sangrey

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.