import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
import pandas as pd
def perform_adf_test(series):
    result = adfuller(series)
    print('ADF Statistic: %f' % result[0])
    print('p-value: %f' % result[1])

10: Undo Stationarity Transformation#

Original Series#

ts = pd.read_csv('../data/original_series.csv')
ts.index = np.arange(1,len(ts)+1)
plt.figure(figsize=(10,4))
plt.plot(ts)

plt.xticks(np.arange(0,78,6), fontsize=14)
plt.xlabel('Hours Since Published', fontsize=16)

plt.yticks(np.arange(0,50000,10000), fontsize=14)
plt.ylabel('Views', fontsize=16)
Text(0, 0.5, 'Views')
../../../_images/38d4a8731c5a08c1933957344547d088a12d7328e20822a5d264e34b3f45a415.png

Original Series: \(v_t\)

  1. Normalize (\(v_t \rightarrow n_t\)): \(n_t = \frac{v_t - \mu}{\sigma}\)

  2. Exponentiate (\(n_t \rightarrow e_t\)): \(e_t = e^{n_t}\)

  3. First Difference (\(e_t \rightarrow d_t\)): \(d_t = e_t - e_{t-1}\)

\(d_t = e^{\frac{v_t - \mu}{\sigma}} - e^{\frac{v_{t-1} - \mu}{\sigma}}\)#

1. Normalize#

mu = np.mean(ts).iloc[0]
sigma = np.std(ts).iloc[0]

norm_ts = (ts - mu) / sigma
plt.figure(figsize=(10,4))
plt.plot(norm_ts)

plt.xticks(np.arange(0,78,6), fontsize=14)
plt.xlabel('Hours Since Published', fontsize=16)

plt.yticks(np.arange(-3,2), fontsize=14)
plt.ylabel('Norm. Views', fontsize=16)

plt.axhline(0, color='k', linestyle='--')
<matplotlib.lines.Line2D at 0x7f5c58be18e0>
../../../_images/1b402a6c7112993638239600614e4b671e227ab54fd4dd808fb219fa660e6b61.png

2. Exponentiate#

exp_ts = np.exp(norm_ts)
plt.figure(figsize=(10,4))
plt.plot(exp_ts)

plt.xticks(np.arange(0,78,6), fontsize=14)
plt.xlabel('Hours Since Published', fontsize=16)

plt.yticks(np.arange(0,3.5,.5), fontsize=14)
plt.ylabel('Exp. Norm. Views', fontsize=16)
Text(0, 0.5, 'Exp. Norm. Views')
../../../_images/6cf37afa8b4f48680a2f52cbbfbd9d3508c0cf8047402fa8b08de703984151cb.png
perform_adf_test(exp_ts)
ADF Statistic: 1.648979
p-value: 0.997997

3. First Difference#

diff_ts = exp_ts.diff().dropna()
plt.figure(figsize=(10,4))
plt.plot(diff_ts)

plt.xticks(np.arange(0,78,6), fontsize=14)
plt.xlabel('Hours Since Published', fontsize=16)

plt.yticks(np.arange(-0.2,0.3,.1), fontsize=14)
plt.ylabel('First Diff. \nExp. Norm. Views', fontsize=16)
Text(0, 0.5, 'First Diff. \nExp. Norm. Views')
../../../_images/7c67a51f5a27a9c9054ed67e1a7035c3a6ab21f2e55bf5fc243b942bcacd899e.png
perform_adf_test(diff_ts)
ADF Statistic: -4.881064
p-value: 0.000038

Fit AR Model#

plot_pacf(diff_ts)
plt.show()
/home/ubuntu/Documents/Projects/msci_data/.venv/lib/python3.9/site-packages/statsmodels/graphics/tsaplots.py:348: FutureWarning: The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
  warnings.warn(
../../../_images/0ebdf95d1f59cbc3cdc8478a3f11d8379489d50ca2343b6856696812b9397afe.png
plot_acf(diff_ts)
plt.show()
../../../_images/4943c33a4771b1c92f86352e8b2de9a7d6ae87962a4e38a771f22717cd4b6ebc.png
#create the model
model = ARIMA(diff_ts, order=(4,1,0))
/home/ubuntu/Documents/Projects/msci_data/.venv/lib/python3.9/site-packages/statsmodels/tsa/base/tsa_model.py:471: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
/home/ubuntu/Documents/Projects/msci_data/.venv/lib/python3.9/site-packages/statsmodels/tsa/base/tsa_model.py:471: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
/home/ubuntu/Documents/Projects/msci_data/.venv/lib/python3.9/site-packages/statsmodels/tsa/base/tsa_model.py:471: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
  self._init_dates(dates, freq)
model_fit = model.fit()

Predict Out 3 Hours#

predictions = model_fit.forecast(3)
/home/ubuntu/Documents/Projects/msci_data/.venv/lib/python3.9/site-packages/statsmodels/tsa/base/tsa_model.py:834: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
  return get_prediction_index(
plt.figure(figsize=(10,4))
plt.plot(diff_ts)

plt.xticks(np.arange(0,78,6), fontsize=14)
plt.xlabel('Hours Since Published', fontsize=16)

plt.yticks(np.arange(-0.2,0.3,.1), fontsize=14)
plt.ylabel('First Diff. \nExp. Norm. Views', fontsize=16)

plt.plot(np.arange(len(ts)+1, len(ts)+4), predictions, color='g')
#plt.fill_between(np.arange(len(ts)+1, len(ts)+4), lower_bound, upper_bound, color='g', alpha=0.1)
[<matplotlib.lines.Line2D at 0x7f5c54ef9e50>]
../../../_images/2794ee617d3b13c51a49c6672edf31f2333e9a174c82dad511f97059b5223b31.png

Undo Transformations:#

\(\hat{d}_{t+1} \rightarrow \hat{v}_{t+1}\)

\(\hat{v}_{t+1} = \sigma \ln(\hat{d}_{t+1} + e^{\frac{v_t - \mu}{\sigma}}) + \mu\)

def undo_transformations(predictions, series, mu, sigma):
    first_pred = sigma*np.log(predictions.iloc[0] + np.exp((series.iloc[-1]-mu)/sigma)) + mu
    orig_predictions = [first_pred]
    
    for i in range(len(predictions.iloc[1:])):
        next_pred = sigma*np.log(predictions.iloc[i+1] + np.exp((orig_predictions[-1]-mu)/sigma)) + mu
        orig_predictions.append(next_pred)
    
    return np.array(orig_predictions).flatten()
orig_preds = undo_transformations(predictions, ts, mu, sigma)

#orig_lower_bound = undo_transformations(lower_bound, ts, mu, sigma)
#orig_upper_bound = undo_transformations(upper_bound, ts, mu, sigma)
plt.figure(figsize=(10,4))
plt.plot(ts)

plt.xticks(np.arange(0,78,6), fontsize=14)
plt.xlabel('Hours Since Published', fontsize=16)

plt.yticks(np.arange(0,50000,10000), fontsize=14)
plt.ylabel('Views', fontsize=16)

plt.plot(np.arange(len(ts)+1, len(ts)+4), orig_preds, color='g')
#plt.fill_between(np.arange(len(ts)+1, len(ts)+4), orig_lower_bound, orig_upper_bound, color='g', alpha=0.1)
[<matplotlib.lines.Line2D at 0x7f5c54e7cb50>]
../../../_images/7cc8646045bd6ed2de45c753a4541d819d032f6bd9a8107866509235d6ede0e7.png
plt.figure(figsize=(10,4))
plt.plot(ts)

plt.xticks(np.arange(0,78), fontsize=14)
plt.xlabel('Hours Since Published', fontsize=16)

plt.yticks(np.arange(40000,46000,1000), fontsize=14)
plt.ylabel('Views', fontsize=16)

plt.plot(np.arange(len(ts)+1, len(ts)+4), orig_preds, color='g')
#plt.fill_between(np.arange(len(ts)+1, len(ts)+4), orig_lower_bound, orig_upper_bound, color='g', alpha=0.1)
plt.xlim(64,76)
plt.ylim(40000, 45000)
(40000.0, 45000.0)
../../../_images/5df4b280ad523713dacfc29a5130e898b0d28c4147702abbfa99e5f41946b56f.png