import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")
df= pd.read_csv('air traffic.csv')
df.head()

# 数据类型转换
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.to_numeric(df[col].str.replace(',', ''), errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 17 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Year     249 non-null    int64  
 1   Month    249 non-null    int64  
 2   Dom_Pax  249 non-null    int64  
 3   Int_Pax  249 non-null    int64  
 4   Pax      249 non-null    int64  
 5   Dom_Flt  249 non-null    int64  
 6   Int_Flt  249 non-null    int64  
 7   Flt      249 non-null    int64  
 8   Dom_RPM  249 non-null    int64  
 9   Int_RPM  249 non-null    int64  
 10  RPM      249 non-null    int64  
 11  Dom_ASM  249 non-null    int64  
 12  Int_ASM  249 non-null    int64  
 13  ASM      249 non-null    int64  
 14  Dom_LF   249 non-null    float64
 15  Int_LF   249 non-null    float64
 16  LF       249 non-null    float64
dtypes: float64(3), int64(14)
memory usage: 33.2 KB

# 由于我们正在做时间序列预测，现在让我们处理月份和年份列
# 这里我们将合并月份和年份，同时将日期设置为每月的第一天
df['Date'] = pd.to_datetime({'year': df['Year'], 'month': df['Month'], 'day': 1})
# 我们把日期设为索引
df.set_index('Date')

# 让我们看看所有变量相对于日期的趋势
all_columns = [col for col in df.columns if col != 'Date']
for column in all_columns:
    plt.figure(figsize=(10, 4))
    sns.lineplot(data=df, x='Date', y=column)
    plt.title(column)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 我们来做一个新的更简单的df
new_df = df[['Date', 'Dom_RPM']].copy()
new_df['Date'] = pd.to_datetime(new_df['Date'])
new_df.set_index('Date', inplace=True)
# 将数据分成训练集和测试集
split_date = pd.to_datetime('2023-01-01')
train = new_df.loc[:split_date]
test = new_df.loc[split_date:]

plt.figure(figsize=(12,6))
plt.plot(new_df.index, new_df['Dom_RPM'], label='Dom_RPM')
plt.axvline(x=split_date, color='red', linestyle='--', label='Train-Test Split')
plt.legend()
plt.show()

# 平稳性检验
from statsmodels.tsa.stattools import adfuller
st_train= adfuller(train['Dom_RPM'], autolag='AIC')
print(st_train)
"""预差分结果表明，在5%显著性水平下，时间序列可能是平稳的，因为ADF统计量(-3.4227)更接近5%临界值(-2.8744)，p值(0.0102)小于0.05，表明在该水平下可以拒绝单位根的原假设。"""

(-3.422669601629956, 0.010209919011204107, 13, 227, {'1%': -3.4594900381360034, '5%': -2.8743581895178485, '10%': -2.573601605503697}, 7463.150034343683)

'预差分结果表明，在5%显著性水平下，时间序列可能是平稳的，因为ADF统计量(-3.4227)更接近5%临界值(-2.8744)，p值(0.0102)小于0.05，表明在该水平下可以拒绝单位根的原假设。'

plt.figure(figsize=(12, 6))
plt.subplot(2, 1, 1)
pd.plotting.autocorrelation_plot(train['Dom_RPM'])
plt.title(f"Autocorrelation Plot for 'Training Set'")
plt.show()

# 差分训练数据
train['Dom_RPM_diff'] = train['Dom_RPM'].diff().shift(-1)
train.dropna(inplace=True)
# 检查差分后的ADF结果
diff_st_train= adfuller(train['Dom_RPM'], autolag='AIC')
print(diff_st_train)

(-3.534624452976126, 0.007141138016109203, 13, 226, {'1%': -3.4596204846395824, '5%': -2.8744153028455948, '10%': -2.5736320761218576}, 7429.093256516401)

from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
plt.plot(train.index, train['Dom_RPM_diff'])
plt.xlabel('Date')
plt.ylabel('Domestic RPM')
plt.xticks(rotation=45)
plt.show()

order = (1, 1, 1)
arima_model = sm.tsa.arima.ARIMA(train['Dom_RPM_diff'].dropna(), order=order)
fitted = arima_model.fit()
print(fitted.summary())

                               SARIMAX Results                                
==============================================================================
Dep. Variable:           Dom_RPM_diff   No. Observations:                  240
Model:                 ARIMA(1, 1, 1)   Log Likelihood               -4058.061
Date:                Sun, 17 Nov 2024   AIC                           8122.122
Time:                        14:23:19   BIC                           8132.552
Sample:                    01-01-2003   HQIC                          8126.325
                         - 12-01-2022                                         
Covariance Type:                  opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.0813      0.052     -1.571      0.116      -0.183       0.020
ma.L1         -0.9967      0.076    -13.055      0.000      -1.146      -0.847
sigma2      4.056e+13    1.3e-15   3.11e+28      0.000    4.06e+13    4.06e+13
===================================================================================
Ljung-Box (L1) (Q):                   0.01   Jarque-Bera (JB):               261.51
Prob(Q):                              0.92   Prob(JB):                         0.00
Heteroskedasticity (H):               2.38   Skew:                            -0.82
Prob(H) (two-sided):                  0.00   Kurtosis:                         7.85
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 1.42e+43. Standard errors may be unstable.

# 分解时间序列以观察季节模式
decomposition = sm.tsa.seasonal_decompose(train['Dom_RPM'], model='additive', period=12)
decomposition.plot()
plt.show()
# ACF和PACF图
fig, (ax1, ax2) = plt.subplots(2,1, figsize=(12,8))
sm.graphics.tsa.plot_acf(train['Dom_RPM'], lags=40, ax=ax1)
sm.graphics.tsa.plot_pacf(train['Dom_RPM'], lags=40, ax=ax2)
plt.show()

# 非季节性组件 
p = 1 # number of lag observations in the model (AR term)
d = 1 # the number of times that the raw observations are differenced (I term)
q = 0  # the size of the moving average window (MA term)
# 季节性组件
P = 0  # seasonal autoregressive order
D = 1  # essasonal differencing order
Q = 1  # seasonal moving average order
s = 12 # the number of periods in a season
seasonal_order = (P, D, Q, s)
sarima_model = sm.tsa.statespace.SARIMAX(train['Dom_RPM'],
order=(p, d, q),
seasonal_order=seasonal_order,
enforce_stationarity=False,
enforce_invertibility=False)
fitted_sarima = sarima_model.fit()
print(fitted_sarima.summary())

                                      SARIMAX Results                                       
============================================================================================
Dep. Variable:                              Dom_RPM   No. Observations:                  240
Model:             SARIMAX(1, 1, 0)x(0, 1, [1], 12)   Log Likelihood               -3547.654
Date:                              Sun, 17 Nov 2024   AIC                           7101.307
Time:                                      14:23:55   BIC                           7111.405
Sample:                                  01-01-2003   HQIC                          7105.388
                                       - 12-01-2022                                         
Covariance Type:                                opg                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.3927      0.032     12.149      0.000       0.329       0.456
ma.S.L12      -0.6053      0.030    -20.049      0.000      -0.664      -0.546
sigma2      2.006e+13   4.64e-16   4.33e+28      0.000    2.01e+13    2.01e+13
===================================================================================
Ljung-Box (L1) (Q):                   0.82   Jarque-Bera (JB):             14702.13
Prob(Q):                              0.37   Prob(JB):                         0.00
Heteroskedasticity (H):               7.38   Skew:                            -3.66
Prob(H) (two-sided):                  0.00   Kurtosis:                        42.94
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 2.29e+43. Standard errors may be unstable.

residuals = fitted_sarima.resid
# 绘制残差
plt.figure(figsize=(10,6))
plt.plot(residuals)
plt.title('Residuals from SARIMA Model')
plt.axhline(y=0, color='black', linestyle='--')
plt.show() # 绘制残差的ACF和PACF
fig, ax = plt.subplots(1,2,figsize=(15,4))
sm.graphics.tsa.plot_acf(residuals, lags=40, ax=ax[0])
sm.graphics.tsa.plot_pacf(residuals, lags=40, ax=ax[1])
plt.show()

# 直方图和核密度估计
residuals.plot(kind='hist', density=True, bins=30, alpha=0.5)
residuals.plot(kind='kde')
plt.title('Histogram and Density Plot of Residuals')
plt.show()

# Q-Q图检查正态性
sm.qqplot(residuals, line='s')
plt.title('Q-Q Plot of Residuals')
plt.show()

# 正态性的统计检验
from scipy import stats
jb_test = stats.jarque_bera(residuals)
print(f'Jarque-Bera test statistics: {jb_test[0]}, p-value: {jb_test[1]}')

Jarque-Bera test statistics: 6792.5399581520915, p-value: 0.0

# 零均值的统计检验
mean_test = stats.ttest_1samp(residuals, 0)
print(f'Test Statistic: {mean_test.statistic}, p-value: {mean_test.pvalue}')

Test Statistic: -0.1012239727859493, p-value: 0.9194575337351738

# 自相关的统计检验
ljung_box_test = sm.stats.acorr_ljungbox(residuals, lags=[40], return_df=True)
print(ljung_box_test)

      lb_stat  lb_pvalue
40  35.096743   0.690385

covid_start = '2020-03-01'
covid_end = '2021-09-01'
# 为covid周期创建虚拟变量
train['covid_dummy'] = np.where((train.index >= covid_start) & (train.index <= covid_end), 1, 0)
# 带有外生变量(哑变量)的SARIMA模型
sarimax_model1 = sm.tsa.statespace.SARIMAX(train['Dom_RPM'],
exog=train['covid_dummy'],
order=(p, d, q),
seasonal_order=(P, D, Q, s),
enforce_stationarity=False,
enforce_invertibility=False)
fitted_sarimax1 = sarimax_model1.fit()
print(fitted_sarimax1.summary())

                                      SARIMAX Results                                       
============================================================================================
Dep. Variable:                              Dom_RPM   No. Observations:                  240
Model:             SARIMAX(1, 1, 0)x(0, 1, [1], 12)   Log Likelihood               -3526.067
Date:                              Sun, 17 Nov 2024   AIC                           7060.134
Time:                                      14:25:03   BIC                           7073.598
Sample:                                  01-01-2003   HQIC                          7065.574
                                       - 12-01-2022                                         
Covariance Type:                                opg                                         
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
covid_dummy -1.936e+07   6.63e+05    -29.201      0.000   -2.07e+07   -1.81e+07
ar.L1           0.2554      0.034      7.490      0.000       0.189       0.322
ma.S.L12       -0.5813      0.030    -19.413      0.000      -0.640      -0.523
sigma2       1.603e+13      0.071   2.25e+14      0.000     1.6e+13     1.6e+13
===================================================================================
Ljung-Box (L1) (Q):                   0.00   Jarque-Bera (JB):              2602.93
Prob(Q):                              0.98   Prob(JB):                         0.00
Heteroskedasticity (H):               7.57   Skew:                            -1.90
Prob(H) (two-sided):                  0.00   Kurtosis:                        19.66
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 5.19e+29. Standard errors may be unstable.

# 让我们再试一次，但要用衰变
covid_start = pd.to_datetime('2020-03-01')
covid_end = pd.to_datetime('2021-09-01')
# 计算自COVID-19开始以来的天数，并将其除以30转换为月份
train['months_since_covid'] = ((train.index - covid_start) / np.timedelta64(1, 'D') / 30).astype(int)
# 线性衰减
duration_in_months = ((covid_end - covid_start) / np.timedelta64(1, 'D') / 30)
train['covid_impact'] = train['months_since_covid'].apply(
lambda x: max(1 - x / duration_in_months, 0) if x >= 0 else 0
)

sarimax_model_with_decay = sm.tsa.statespace.SARIMAX(
train['Dom_RPM'],
exog=train[['covid_dummy', 'covid_impact']],
order=(p, d, q),
seasonal_order=(P, D, Q, s),
enforce_stationarity=False,
enforce_invertibility=False
)
fitted_sarimax_with_decay = sarimax_model_with_decay.fit()
print(fitted_sarimax_with_decay.summary())

                                      SARIMAX Results                                       
============================================================================================
Dep. Variable:                              Dom_RPM   No. Observations:                  240
Model:             SARIMAX(1, 1, 0)x(0, 1, [1], 12)   Log Likelihood               -3527.070
Date:                              Sun, 17 Nov 2024   AIC                           7064.140
Time:                                      14:25:27   BIC                           7080.969
Sample:                                  01-01-2003   HQIC                          7070.940
                                       - 12-01-2022                                         
Covariance Type:                                opg                                         
================================================================================
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
covid_dummy  -1.934e+07   6.64e+05    -29.114      0.000   -2.06e+07    -1.8e+07
covid_impact -2.007e+06   8.57e+04    -23.411      0.000   -2.17e+06   -1.84e+06
ar.L1            0.2352      0.035      6.700      0.000       0.166       0.304
ma.S.L12        -0.5838      0.030    -19.598      0.000      -0.642      -0.525
sigma2        1.625e+13      0.027   5.92e+14      0.000    1.63e+13    1.63e+13
===================================================================================
Ljung-Box (L1) (Q):                   0.01   Jarque-Bera (JB):              2841.04
Prob(Q):                              0.93   Prob(JB):                         0.00
Heteroskedasticity (H):               7.63   Skew:                            -2.02
Prob(H) (two-sided):                  0.00   Kurtosis:                        20.38
===================================================================================

Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
[2] Covariance matrix is singular or near-singular, with condition number 4.74e+30. Standard errors may be unstable.

# 预测期数(未来12个月)
n_periods = 12
future_dates = pd.date_range(train.index[-1] + pd.offsets.MonthBegin(), periods=n_periods, freq='M')
# 假设新冠病毒开始消失，那么影响将从0.5降至0
future_covid_dummy = [0] * n_periods
future_covid_impact = np.linspace(start=0.5, stop=0, num=n_periods)  #
# 组合成df
future_exog = pd.DataFrame({'covid_dummy': future_covid_dummy, 'covid_impact': future_covid_impact}, index=future_dates)
n_periods = 12
forecast = fitted_sarimax_with_decay.get_forecast(steps=n_periods, exog=future_exog)
forecast_mean = forecast.predicted_mean
forecast_conf_int = forecast.conf_int()

# 利用历史数据绘制预测图
plt.figure(figsize=(10, 6))
plt.plot(train.index, train['Dom_RPM'], label='Historical')
plt.plot(pd.date_range(train.index[-1], periods=n_periods, freq='M'), forecast_mean, label='Forecast')
plt.fill_between(pd.date_range(train.index[-1], periods=n_periods, freq='M'), forecast_conf_int.iloc[:, 0], forecast_conf_int.iloc[:, 1], color='red', alpha=0.3)
plt.legend()
plt.show()

# 让我们把我们的预测和测试进行比较
plt.figure(figsize=(10, 6))
plt.plot(train.index, train['Dom_RPM'], label='Historical', color='blue')
plt.plot(forecast_mean.index, forecast_mean, label='Forecast', color='red')
plt.fill_between(forecast_conf_int.index, forecast_conf_int.iloc[:, 0], forecast_conf_int.iloc[:, 1], color='red', alpha=0.3)
plt.plot(test.index, test['Dom_RPM'], label='Actual Test Set', color='green')
plt.title('Comparison between the Test and Forecast')
plt.xlabel('Date')
plt.ylabel('Dom_RPM (in millions)')
plt.legend()
plt.show()

	Year	Month	Dom_Pax	Int_Pax	Pax	Dom_Flt	Int_Flt	Flt	Dom_RPM	Int_RPM	RPM	Dom_ASM	Int_ASM	ASM	Dom_LF	Int_LF	LF
0	2003	1	43,032,450	4,905,830	47,938,280	785,160	57,667	842,827	36,211,422	12,885,980	49,097,402	56,191,300	17,968,572	74,159,872	64.44	71.71	66.20
1	2003	2	41,166,780	4,245,366	45,412,146	690,351	51,259	741,610	34,148,439	10,715,468	44,863,907	50,088,434	15,587,880	65,676,314	68.18	68.74	68.31
2	2003	3	49,992,700	5,008,613	55,001,313	797,194	58,926	856,120	41,774,564	12,567,068	54,341,633	57,592,901	17,753,174	75,346,075	72.53	70.79	72.12
3	2003	4	47,033,260	4,345,444	51,378,704	766,260	55,005	821,265	39,465,980	10,370,592	49,836,572	54,639,679	15,528,761	70,168,440	72.23	66.78	71.02
4	2003	5	49,152,352	4,610,834	53,763,186	789,397	55,265	844,662	41,001,934	11,575,026	52,576,960	55,349,897	15,629,821	70,979,718	74.08	74.06	74.07

	Year	Month	Dom_Pax	Int_Pax	Pax	Dom_Flt	Int_Flt	Flt	Dom_RPM	Int_RPM	RPM	Dom_ASM	Int_ASM	ASM	Dom_LF	Int_LF	LF
Date
2003-01-01	2003	1	43032450	4905830	47938280	785160	57667	842827	36211422	12885980	49097402	56191300	17968572	74159872	64.44	71.71	66.20
2003-02-01	2003	2	41166780	4245366	45412146	690351	51259	741610	34148439	10715468	44863907	50088434	15587880	65676314	68.18	68.74	68.31
2003-03-01	2003	3	49992700	5008613	55001313	797194	58926	856120	41774564	12567068	54341633	57592901	17753174	75346075	72.53	70.79	72.12
2003-04-01	2003	4	47033260	4345444	51378704	766260	55005	821265	39465980	10370592	49836572	54639679	15528761	70168440	72.23	66.78	71.02
2003-05-01	2003	5	49152352	4610834	53763186	789397	55265	844662	41001934	11575026	52576960	55349897	15629821	70979718	74.08	74.06	74.07
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2023-05-01	2023	5	71423653	10358666	81782319	667331	71924	739255	66743565	26805432	93548998	77821407	31950687	109772094	85.77	83.90	85.22
2023-06-01	2023	6	72482621	11544505	84027126	661293	75279	736572	68789127	29883465	98672591	78058358	33410671	111469028	88.13	89.44	88.52
2023-07-01	2023	7	75378157	12432615	87810772	684939	79738	764677	72267904	31376000	103643904	81986010	35326191	117312202	88.15	88.82	88.35
2023-08-01	2023	8	71477988	11572149	83050137	691482	77137	768619	67933484	29938507	97871992	81997399	34908793	116906192	82.85	85.76	83.72
2023-09-01	2023	9	66858490	9392985	76251475	649308	64241	713549	61777546	26076318	87853864	75748336	31231710	106980046	81.56	83.49	82.12

📘 基于ARIMA+SARIMA的航空公司 RPM 时间序列预测模型/RPM.ipynb