In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")
df= pd.read_csv('air traffic.csv')
df.head()
Out[1]:
| Year | Month | Dom_Pax | Int_Pax | Pax | Dom_Flt | Int_Flt | Flt | Dom_RPM | Int_RPM | RPM | Dom_ASM | Int_ASM | ASM | Dom_LF | Int_LF | LF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2003 | 1 | 43,032,450 | 4,905,830 | 47,938,280 | 785,160 | 57,667 | 842,827 | 36,211,422 | 12,885,980 | 49,097,402 | 56,191,300 | 17,968,572 | 74,159,872 | 64.44 | 71.71 | 66.20 |
| 1 | 2003 | 2 | 41,166,780 | 4,245,366 | 45,412,146 | 690,351 | 51,259 | 741,610 | 34,148,439 | 10,715,468 | 44,863,907 | 50,088,434 | 15,587,880 | 65,676,314 | 68.18 | 68.74 | 68.31 |
| 2 | 2003 | 3 | 49,992,700 | 5,008,613 | 55,001,313 | 797,194 | 58,926 | 856,120 | 41,774,564 | 12,567,068 | 54,341,633 | 57,592,901 | 17,753,174 | 75,346,075 | 72.53 | 70.79 | 72.12 |
| 3 | 2003 | 4 | 47,033,260 | 4,345,444 | 51,378,704 | 766,260 | 55,005 | 821,265 | 39,465,980 | 10,370,592 | 49,836,572 | 54,639,679 | 15,528,761 | 70,168,440 | 72.23 | 66.78 | 71.02 |
| 4 | 2003 | 5 | 49,152,352 | 4,610,834 | 53,763,186 | 789,397 | 55,265 | 844,662 | 41,001,934 | 11,575,026 | 52,576,960 | 55,349,897 | 15,629,821 | 70,979,718 | 74.08 | 74.06 | 74.07 |
In [3]:
# 数据类型转换
for col in df.columns:
if df[col].dtype == 'object':
df[col] = pd.to_numeric(df[col].str.replace(',', ''), errors='coerce')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 249 entries, 0 to 248 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 249 non-null int64 1 Month 249 non-null int64 2 Dom_Pax 249 non-null int64 3 Int_Pax 249 non-null int64 4 Pax 249 non-null int64 5 Dom_Flt 249 non-null int64 6 Int_Flt 249 non-null int64 7 Flt 249 non-null int64 8 Dom_RPM 249 non-null int64 9 Int_RPM 249 non-null int64 10 RPM 249 non-null int64 11 Dom_ASM 249 non-null int64 12 Int_ASM 249 non-null int64 13 ASM 249 non-null int64 14 Dom_LF 249 non-null float64 15 Int_LF 249 non-null float64 16 LF 249 non-null float64 dtypes: float64(3), int64(14) memory usage: 33.2 KB
In [4]:
# 由于我们正在做时间序列预测,现在让我们处理月份和年份列
# 这里我们将合并月份和年份,同时将日期设置为每月的第一天
df['Date'] = pd.to_datetime({'year': df['Year'], 'month': df['Month'], 'day': 1})
# 我们把日期设为索引
df.set_index('Date')
Out[4]:
| Year | Month | Dom_Pax | Int_Pax | Pax | Dom_Flt | Int_Flt | Flt | Dom_RPM | Int_RPM | RPM | Dom_ASM | Int_ASM | ASM | Dom_LF | Int_LF | LF | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||
| 2003-01-01 | 2003 | 1 | 43032450 | 4905830 | 47938280 | 785160 | 57667 | 842827 | 36211422 | 12885980 | 49097402 | 56191300 | 17968572 | 74159872 | 64.44 | 71.71 | 66.20 |
| 2003-02-01 | 2003 | 2 | 41166780 | 4245366 | 45412146 | 690351 | 51259 | 741610 | 34148439 | 10715468 | 44863907 | 50088434 | 15587880 | 65676314 | 68.18 | 68.74 | 68.31 |
| 2003-03-01 | 2003 | 3 | 49992700 | 5008613 | 55001313 | 797194 | 58926 | 856120 | 41774564 | 12567068 | 54341633 | 57592901 | 17753174 | 75346075 | 72.53 | 70.79 | 72.12 |
| 2003-04-01 | 2003 | 4 | 47033260 | 4345444 | 51378704 | 766260 | 55005 | 821265 | 39465980 | 10370592 | 49836572 | 54639679 | 15528761 | 70168440 | 72.23 | 66.78 | 71.02 |
| 2003-05-01 | 2003 | 5 | 49152352 | 4610834 | 53763186 | 789397 | 55265 | 844662 | 41001934 | 11575026 | 52576960 | 55349897 | 15629821 | 70979718 | 74.08 | 74.06 | 74.07 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2023-05-01 | 2023 | 5 | 71423653 | 10358666 | 81782319 | 667331 | 71924 | 739255 | 66743565 | 26805432 | 93548998 | 77821407 | 31950687 | 109772094 | 85.77 | 83.90 | 85.22 |
| 2023-06-01 | 2023 | 6 | 72482621 | 11544505 | 84027126 | 661293 | 75279 | 736572 | 68789127 | 29883465 | 98672591 | 78058358 | 33410671 | 111469028 | 88.13 | 89.44 | 88.52 |
| 2023-07-01 | 2023 | 7 | 75378157 | 12432615 | 87810772 | 684939 | 79738 | 764677 | 72267904 | 31376000 | 103643904 | 81986010 | 35326191 | 117312202 | 88.15 | 88.82 | 88.35 |
| 2023-08-01 | 2023 | 8 | 71477988 | 11572149 | 83050137 | 691482 | 77137 | 768619 | 67933484 | 29938507 | 97871992 | 81997399 | 34908793 | 116906192 | 82.85 | 85.76 | 83.72 |
| 2023-09-01 | 2023 | 9 | 66858490 | 9392985 | 76251475 | 649308 | 64241 | 713549 | 61777546 | 26076318 | 87853864 | 75748336 | 31231710 | 106980046 | 81.56 | 83.49 | 82.12 |
249 rows × 17 columns
In [6]:
# 让我们看看所有变量相对于日期的趋势
all_columns = [col for col in df.columns if col != 'Date']
for column in all_columns:
plt.figure(figsize=(10, 4))
sns.lineplot(data=df, x='Date', y=column)
plt.title(column)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [7]:
# 我们来做一个新的更简单的df
new_df = df[['Date', 'Dom_RPM']].copy()
new_df['Date'] = pd.to_datetime(new_df['Date'])
new_df.set_index('Date', inplace=True)
# 将数据分成训练集和测试集
split_date = pd.to_datetime('2023-01-01')
train = new_df.loc[:split_date]
test = new_df.loc[split_date:]
In [8]:
plt.figure(figsize=(12,6))
plt.plot(new_df.index, new_df['Dom_RPM'], label='Dom_RPM')
plt.axvline(x=split_date, color='red', linestyle='--', label='Train-Test Split')
plt.legend()
plt.show()
In [10]:
# 平稳性检验
from statsmodels.tsa.stattools import adfuller
st_train= adfuller(train['Dom_RPM'], autolag='AIC')
print(st_train)
"""预差分结果表明,在5%显著性水平下,时间序列可能是平稳的,因为ADF统计量(-3.4227)更接近5%临界值(-2.8744),p值(0.0102)小于0.05,表明在该水平下可以拒绝单位根的原假设。"""
(-3.422669601629956, 0.010209919011204107, 13, 227, {'1%': -3.4594900381360034, '5%': -2.8743581895178485, '10%': -2.573601605503697}, 7463.150034343683)
Out[10]:
'预差分结果表明,在5%显著性水平下,时间序列可能是平稳的,因为ADF统计量(-3.4227)更接近5%临界值(-2.8744),p值(0.0102)小于0.05,表明在该水平下可以拒绝单位根的原假设。'
In [11]:
plt.figure(figsize=(12, 6))
plt.subplot(2, 1, 1)
pd.plotting.autocorrelation_plot(train['Dom_RPM'])
plt.title(f"Autocorrelation Plot for 'Training Set'")
plt.show()
In [12]:
# 差分训练数据
train['Dom_RPM_diff'] = train['Dom_RPM'].diff().shift(-1)
train.dropna(inplace=True)
# 检查差分后的ADF结果
diff_st_train= adfuller(train['Dom_RPM'], autolag='AIC')
print(diff_st_train)
(-3.534624452976126, 0.007141138016109203, 13, 226, {'1%': -3.4596204846395824, '5%': -2.8744153028455948, '10%': -2.5736320761218576}, 7429.093256516401)
In [13]:
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
plt.plot(train.index, train['Dom_RPM_diff'])
plt.xlabel('Date')
plt.ylabel('Domestic RPM')
plt.xticks(rotation=45)
plt.show()
In [15]:
order = (1, 1, 1)
arima_model = sm.tsa.arima.ARIMA(train['Dom_RPM_diff'].dropna(), order=order)
fitted = arima_model.fit()
print(fitted.summary())
SARIMAX Results
==============================================================================
Dep. Variable: Dom_RPM_diff No. Observations: 240
Model: ARIMA(1, 1, 1) Log Likelihood -4058.061
Date: Sun, 17 Nov 2024 AIC 8122.122
Time: 14:23:19 BIC 8132.552
Sample: 01-01-2003 HQIC 8126.325
- 12-01-2022
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 -0.0813 0.052 -1.571 0.116 -0.183 0.020
ma.L1 -0.9967 0.076 -13.055 0.000 -1.146 -0.847
sigma2 4.056e+13 1.3e-15 3.11e+28 0.000 4.06e+13 4.06e+13
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 261.51
Prob(Q): 0.92 Prob(JB): 0.00
Heteroskedasticity (H): 2.38 Skew: -0.82
Prob(H) (two-sided): 0.00 Kurtosis: 7.85
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.42e+43. Standard errors may be unstable.
In [16]:
# 分解时间序列以观察季节模式
decomposition = sm.tsa.seasonal_decompose(train['Dom_RPM'], model='additive', period=12)
decomposition.plot()
plt.show()
# ACF和PACF图
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(12,8))
sm.graphics.tsa.plot_acf(train['Dom_RPM'], lags=40, ax=ax1)
sm.graphics.tsa.plot_pacf(train['Dom_RPM'], lags=40, ax=ax2)
plt.show()
In [17]:
# 非季节性组件
p = 1 # number of lag observations in the model (AR term)
d = 1 # the number of times that the raw observations are differenced (I term)
q = 0 # the size of the moving average window (MA term)
# 季节性组件
P = 0 # seasonal autoregressive order
D = 1 # essasonal differencing order
Q = 1 # seasonal moving average order
s = 12 # the number of periods in a season
seasonal_order = (P, D, Q, s)
sarima_model = sm.tsa.statespace.SARIMAX(train['Dom_RPM'],
order=(p, d, q),
seasonal_order=seasonal_order,
enforce_stationarity=False,
enforce_invertibility=False)
fitted_sarima = sarima_model.fit()
print(fitted_sarima.summary())
SARIMAX Results
============================================================================================
Dep. Variable: Dom_RPM No. Observations: 240
Model: SARIMAX(1, 1, 0)x(0, 1, [1], 12) Log Likelihood -3547.654
Date: Sun, 17 Nov 2024 AIC 7101.307
Time: 14:23:55 BIC 7111.405
Sample: 01-01-2003 HQIC 7105.388
- 12-01-2022
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.3927 0.032 12.149 0.000 0.329 0.456
ma.S.L12 -0.6053 0.030 -20.049 0.000 -0.664 -0.546
sigma2 2.006e+13 4.64e-16 4.33e+28 0.000 2.01e+13 2.01e+13
===================================================================================
Ljung-Box (L1) (Q): 0.82 Jarque-Bera (JB): 14702.13
Prob(Q): 0.37 Prob(JB): 0.00
Heteroskedasticity (H): 7.38 Skew: -3.66
Prob(H) (two-sided): 0.00 Kurtosis: 42.94
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 2.29e+43. Standard errors may be unstable.
In [18]:
residuals = fitted_sarima.resid
# 绘制残差
plt.figure(figsize=(10,6))
plt.plot(residuals)
plt.title('Residuals from SARIMA Model')
plt.axhline(y=0, color='black', linestyle='--')
plt.show() # 绘制残差的ACF和PACF
fig, ax = plt.subplots(1,2,figsize=(15,4))
sm.graphics.tsa.plot_acf(residuals, lags=40, ax=ax[0])
sm.graphics.tsa.plot_pacf(residuals, lags=40, ax=ax[1])
plt.show()
In [19]:
# 直方图和核密度估计
residuals.plot(kind='hist', density=True, bins=30, alpha=0.5)
residuals.plot(kind='kde')
plt.title('Histogram and Density Plot of Residuals')
plt.show()
In [20]:
# Q-Q图检查正态性
sm.qqplot(residuals, line='s')
plt.title('Q-Q Plot of Residuals')
plt.show()
In [21]:
# 正态性的统计检验
from scipy import stats
jb_test = stats.jarque_bera(residuals)
print(f'Jarque-Bera test statistics: {jb_test[0]}, p-value: {jb_test[1]}')
Jarque-Bera test statistics: 6792.5399581520915, p-value: 0.0
In [22]:
# 零均值的统计检验
mean_test = stats.ttest_1samp(residuals, 0)
print(f'Test Statistic: {mean_test.statistic}, p-value: {mean_test.pvalue}')
Test Statistic: -0.1012239727859493, p-value: 0.9194575337351738
In [23]:
# 自相关的统计检验
ljung_box_test = sm.stats.acorr_ljungbox(residuals, lags=[40], return_df=True)
print(ljung_box_test)
lb_stat lb_pvalue 40 35.096743 0.690385
In [24]:
covid_start = '2020-03-01'
covid_end = '2021-09-01'
# 为covid周期创建虚拟变量
train['covid_dummy'] = np.where((train.index >= covid_start) & (train.index <= covid_end), 1, 0)
# 带有外生变量(哑变量)的SARIMA模型
sarimax_model1 = sm.tsa.statespace.SARIMAX(train['Dom_RPM'],
exog=train['covid_dummy'],
order=(p, d, q),
seasonal_order=(P, D, Q, s),
enforce_stationarity=False,
enforce_invertibility=False)
fitted_sarimax1 = sarimax_model1.fit()
print(fitted_sarimax1.summary())
SARIMAX Results
============================================================================================
Dep. Variable: Dom_RPM No. Observations: 240
Model: SARIMAX(1, 1, 0)x(0, 1, [1], 12) Log Likelihood -3526.067
Date: Sun, 17 Nov 2024 AIC 7060.134
Time: 14:25:03 BIC 7073.598
Sample: 01-01-2003 HQIC 7065.574
- 12-01-2022
Covariance Type: opg
===============================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------
covid_dummy -1.936e+07 6.63e+05 -29.201 0.000 -2.07e+07 -1.81e+07
ar.L1 0.2554 0.034 7.490 0.000 0.189 0.322
ma.S.L12 -0.5813 0.030 -19.413 0.000 -0.640 -0.523
sigma2 1.603e+13 0.071 2.25e+14 0.000 1.6e+13 1.6e+13
===================================================================================
Ljung-Box (L1) (Q): 0.00 Jarque-Bera (JB): 2602.93
Prob(Q): 0.98 Prob(JB): 0.00
Heteroskedasticity (H): 7.57 Skew: -1.90
Prob(H) (two-sided): 0.00 Kurtosis: 19.66
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 5.19e+29. Standard errors may be unstable.
In [25]:
# 让我们再试一次,但要用衰变
covid_start = pd.to_datetime('2020-03-01')
covid_end = pd.to_datetime('2021-09-01')
# 计算自COVID-19开始以来的天数,并将其除以30转换为月份
train['months_since_covid'] = ((train.index - covid_start) / np.timedelta64(1, 'D') / 30).astype(int)
# 线性衰减
duration_in_months = ((covid_end - covid_start) / np.timedelta64(1, 'D') / 30)
train['covid_impact'] = train['months_since_covid'].apply(
lambda x: max(1 - x / duration_in_months, 0) if x >= 0 else 0
)
In [26]:
sarimax_model_with_decay = sm.tsa.statespace.SARIMAX(
train['Dom_RPM'],
exog=train[['covid_dummy', 'covid_impact']],
order=(p, d, q),
seasonal_order=(P, D, Q, s),
enforce_stationarity=False,
enforce_invertibility=False
)
fitted_sarimax_with_decay = sarimax_model_with_decay.fit()
print(fitted_sarimax_with_decay.summary())
SARIMAX Results
============================================================================================
Dep. Variable: Dom_RPM No. Observations: 240
Model: SARIMAX(1, 1, 0)x(0, 1, [1], 12) Log Likelihood -3527.070
Date: Sun, 17 Nov 2024 AIC 7064.140
Time: 14:25:27 BIC 7080.969
Sample: 01-01-2003 HQIC 7070.940
- 12-01-2022
Covariance Type: opg
================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------
covid_dummy -1.934e+07 6.64e+05 -29.114 0.000 -2.06e+07 -1.8e+07
covid_impact -2.007e+06 8.57e+04 -23.411 0.000 -2.17e+06 -1.84e+06
ar.L1 0.2352 0.035 6.700 0.000 0.166 0.304
ma.S.L12 -0.5838 0.030 -19.598 0.000 -0.642 -0.525
sigma2 1.625e+13 0.027 5.92e+14 0.000 1.63e+13 1.63e+13
===================================================================================
Ljung-Box (L1) (Q): 0.01 Jarque-Bera (JB): 2841.04
Prob(Q): 0.93 Prob(JB): 0.00
Heteroskedasticity (H): 7.63 Skew: -2.02
Prob(H) (two-sided): 0.00 Kurtosis: 20.38
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.74e+30. Standard errors may be unstable.
In [27]:
# 预测期数(未来12个月)
n_periods = 12
future_dates = pd.date_range(train.index[-1] + pd.offsets.MonthBegin(), periods=n_periods, freq='M')
# 假设新冠病毒开始消失,那么影响将从0.5降至0
future_covid_dummy = [0] * n_periods
future_covid_impact = np.linspace(start=0.5, stop=0, num=n_periods) #
# 组合成df
future_exog = pd.DataFrame({'covid_dummy': future_covid_dummy, 'covid_impact': future_covid_impact}, index=future_dates)
n_periods = 12
forecast = fitted_sarimax_with_decay.get_forecast(steps=n_periods, exog=future_exog)
forecast_mean = forecast.predicted_mean
forecast_conf_int = forecast.conf_int()
In [28]:
# 利用历史数据绘制预测图
plt.figure(figsize=(10, 6))
plt.plot(train.index, train['Dom_RPM'], label='Historical')
plt.plot(pd.date_range(train.index[-1], periods=n_periods, freq='M'), forecast_mean, label='Forecast')
plt.fill_between(pd.date_range(train.index[-1], periods=n_periods, freq='M'), forecast_conf_int.iloc[:, 0], forecast_conf_int.iloc[:, 1], color='red', alpha=0.3)
plt.legend()
plt.show()
In [29]:
# 让我们把我们的预测和测试进行比较
plt.figure(figsize=(10, 6))
plt.plot(train.index, train['Dom_RPM'], label='Historical', color='blue')
plt.plot(forecast_mean.index, forecast_mean, label='Forecast', color='red')
plt.fill_between(forecast_conf_int.index, forecast_conf_int.iloc[:, 0], forecast_conf_int.iloc[:, 1], color='red', alpha=0.3)
plt.plot(test.index, test['Dom_RPM'], label='Actual Test Set', color='green')
plt.title('Comparison between the Test and Forecast')
plt.xlabel('Date')
plt.ylabel('Dom_RPM (in millions)')
plt.legend()
plt.show()
In [ ]: