In [ ]:
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score, explained_variance_score
from keras import Model
from keras.layers import Input, Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')
# === 数据收集与预处理 ===
df = pd.read_csv('Gold_Data.csv')
# 1. 数据清洗:去除无关列(如交易量和百分比变化),确保数据质量
# df.drop(['Volume'], axis=1, inplace=True) # 如果你不需要成交量数据,去除该列
# 2. 日期格式转换与排序
df['Date'] = pd.to_datetime(df['Date'])
df.sort_values(by='Date', ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
# 3. 去除“,”符号并转换为float类型
NumCols = df.columns.drop(['Date'])
df[NumCols] = df[NumCols].replace({',': ''}, regex=True).astype('float64')
# 4. 缺失值检查
if df.isnull().sum().sum() > 0:
print("数据中存在缺失值!请检查。")
else:
print("数据无缺失值。")
# === 数据波动规律分析 ===
# 使用描述性统计方法分析黄金价格波动特性
print(f"黄金价格数据的均值:\n{df['Adj_Close'].mean()}")
print(f"黄金价格数据的标准差:\n{df['Adj_Close'].std()}")
print(f"黄金价格数据的变异系数:\n{df['Adj_Close'].std() / df['Adj_Close'].mean()}")
# 自相关函数与偏自相关函数分析 (ACF 和 PACF)
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(df['Adj_Close'], lags=50)
plt.title('自相关函数 (ACF) 分析')
plt.show()
plot_pacf(df['Adj_Close'], lags=50)
plt.title('偏自相关函数 (PACF) 分析')
plt.show()
# === 可视化黄金价格 ===
fig = px.line(y=df['Adj_Close'], x=df['Date'], title="Gold Price History Data")
fig.update_traces(line_color='black')
fig.update_layout(plot_bgcolor='rgba(255,223,0,0.8)', xaxis_title="Date", yaxis_title="Scaled Price")
fig.show()
# === 数据预处理:标准化与归一化 ===
scaler = MinMaxScaler()
scaler.fit(df['Adj_Close'].values.reshape(-1, 1))
# 训练数据与测试数据切分,测试数据包含2022年的数据
test_size = df[df['Date'].dt.year == 2022].shape[0]
train_data = scaler.transform(df['Adj_Close'][:-test_size].values.reshape(-1, 1))
test_data = scaler.transform(df['Adj_Close'][-test_size - 60:].values.reshape(-1, 1))
# 滑动窗口构造特征和标签
window_size = 60
X_train, y_train, X_test, y_test = [], [], [], []
# 训练数据
for i in range(window_size, len(train_data)):
X_train.append(train_data[i - window_size:i, 0])
y_train.append(train_data[i, 0])
# 测试数据
for i in range(window_size, len(test_data)):
X_test.append(test_data[i - window_size:i, 0])
y_test.append(test_data[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)
# 调整数据形状以适应LSTM输入
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))
# === 基于LSTM的黄金价格预测 ===
def define_model():
input1 = Input(shape=(window_size, 1))
x = LSTM(units=64, return_sequences=True)(input1)
x = Dropout(0.2)(x)
x = LSTM(units=32)(x) # 使用较少的LSTM单元以减少过拟合
x = Dropout(0.2)(x)
dnn_output = Dense(1)(x) # 输出层为线性激活函数
model = Model(inputs=input1, outputs=[dnn_output])
model.compile(loss='mean_squared_error', optimizer='Adam') # 使用Adam优化器进行回归任务
return model
model = define_model()
# 训练模型
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2,
callbacks=[early_stopping], verbose=1)
# === 可视化结果 ===
y_pred = model.predict(X_test)
y_test_true = scaler.inverse_transform(y_test)
y_test_pred = scaler.inverse_transform(y_pred)
# 训练数据与预测结果可视化
plt.figure(figsize=(15, 6))
plt.plot(df['Date'].iloc[:-test_size], scaler.inverse_transform(train_data), color='black', lw=2)
plt.plot(df['Date'].iloc[-test_size:], y_test_true, color='blue', lw=2)
plt.plot(df['Date'].iloc[-test_size:], y_test_pred, color='red', lw=2)
plt.title('Model Performance on Gold Price Prediction', fontsize=15)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Price', fontsize=12)
plt.legend(['Training Data', 'Actual Test Data', 'Predicted Test Data'], loc='upper left', prop={'size': 15})
plt.grid(color='white')
plt.show()
# === 结果评价 ===
MAPE = mean_absolute_percentage_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
MAE = np.mean(np.abs(y_test - y_pred))
R2 = r2_score(y_test, y_pred)
SMAPE = np.mean(2 * np.abs(y_test - y_pred) / (np.abs(y_test) + np.abs(y_pred))) * 100
EVS = explained_variance_score(y_test, y_pred)
print(f"MAPE: {MAPE * 100:.2f}%")
print(f"RMSE: {RMSE:.2f}")
print(f"MAE: {MAE:.2f}")
print(f"R² Score: {R2:.2f}")
print(f"SMAPE: {SMAPE:.2f}%")
print(f"Explained Variance Score: {EVS:.2f}")
# 残差分布
residuals = y_test - y_pred
plt.figure(figsize=(10, 5))
plt.hist(residuals, bins=50, color='purple', alpha=0.7)
plt.title('Residual Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()
数据无缺失值。 黄金价格数据的均值: 1150.7248180549143 黄金价格数据的标准差: 569.1439853180088 黄金价格数据的变异系数: 0.4945960809987922
Epoch 1/100 144/144 [==============================] - 18s 76ms/step - loss: 0.0046 - val_loss: 3.7204e-04 Epoch 2/100 144/144 [==============================] - 5s 37ms/step - loss: 0.0013 - val_loss: 0.0012 Epoch 3/100 144/144 [==============================] - 5s 37ms/step - loss: 0.0011 - val_loss: 3.1841e-04 Epoch 4/100 144/144 [==============================] - 5s 37ms/step - loss: 0.0010 - val_loss: 4.8822e-04 Epoch 5/100 144/144 [==============================] - 5s 37ms/step - loss: 8.8024e-04 - val_loss: 2.0068e-04 Epoch 6/100 144/144 [==============================] - 6s 41ms/step - loss: 8.4688e-04 - val_loss: 2.4858e-04 Epoch 7/100 144/144 [==============================] - 5s 37ms/step - loss: 7.6652e-04 - val_loss: 2.4751e-04 Epoch 8/100 144/144 [==============================] - 5s 35ms/step - loss: 7.3120e-04 - val_loss: 4.3774e-04 Epoch 9/100 144/144 [==============================] - 5s 37ms/step - loss: 6.6445e-04 - val_loss: 1.8103e-04 Epoch 10/100 144/144 [==============================] - 5s 36ms/step - loss: 5.8101e-04 - val_loss: 2.1313e-04 Epoch 11/100 144/144 [==============================] - 5s 35ms/step - loss: 5.5550e-04 - val_loss: 2.4173e-04 Epoch 12/100 144/144 [==============================] - 12s 82ms/step - loss: 5.2842e-04 - val_loss: 1.6630e-04 Epoch 13/100 144/144 [==============================] - 12s 84ms/step - loss: 4.7654e-04 - val_loss: 2.7373e-04 Epoch 14/100 144/144 [==============================] - 11s 76ms/step - loss: 4.7484e-04 - val_loss: 4.3786e-04 Epoch 15/100 144/144 [==============================] - 5s 34ms/step - loss: 4.5052e-04 - val_loss: 1.7527e-04 Epoch 16/100 144/144 [==============================] - 5s 34ms/step - loss: 4.2153e-04 - val_loss: 6.3881e-04 Epoch 17/100 144/144 [==============================] - 5s 34ms/step - loss: 4.1771e-04 - val_loss: 1.9830e-04 Epoch 18/100 144/144 [==============================] - 5s 33ms/step - loss: 4.2623e-04 - val_loss: 5.7228e-04 Epoch 19/100 144/144 [==============================] - 5s 33ms/step - loss: 4.1016e-04 - val_loss: 1.9803e-04 Epoch 20/100 144/144 [==============================] - 5s 34ms/step - loss: 3.7692e-04 - val_loss: 2.8257e-04 Epoch 21/100 144/144 [==============================] - 5s 34ms/step - loss: 4.1918e-04 - val_loss: 2.9835e-04 Epoch 22/100 144/144 [==============================] - 5s 35ms/step - loss: 3.7815e-04 - val_loss: 3.3123e-04 8/8 [==============================] - 1s 12ms/step
MAPE: 1.89% RMSE: 0.02 MAE: 0.02 R² Score: 0.95 SMAPE: 1.91% Explained Variance Score: 0.97
In [3]:
# === 使用 Plotly 绘制训练数据与预测结果 ===
# 准备数据
train_dates = df['Date'].iloc[:-test_size]
test_dates = df['Date'].iloc[-test_size:]
# 绘制训练数据与测试数据(实际值和预测值)
fig = px.line(title="Model Performance on Gold Price Prediction")
fig.add_scatter(x=train_dates, y=scaler.inverse_transform(train_data).ravel(),
mode='lines', name='Training Data', line=dict(color='black', width=2))
fig.add_scatter(x=test_dates, y=y_test_true.ravel(),
mode='lines', name='Actual Test Data', line=dict(color='blue', width=2))
fig.add_scatter(x=test_dates, y=y_test_pred.ravel(),
mode='lines', name='Predicted Test Data', line=dict(color='red', width=2))
# 更新布局
fig.update_layout(
xaxis_title="Date",
yaxis_title="Price",
plot_bgcolor="white",
font=dict(size=12),
legend=dict(title="Legend", font=dict(size=12)),
title_font=dict(size=16)
)
fig.update_xaxes(showgrid=True, gridcolor='lightgray')
fig.update_yaxes(showgrid=True, gridcolor='lightgray')
# 显示图表
fig.show()
# === 使用 Plotly 绘制残差分布 ===
residuals = y_test - y_pred
fig_residuals = px.histogram(
residuals.ravel(),
nbins=50,
title="Residual Distribution",
labels={"value": "Residuals", "count": "Frequency"},
color_discrete_sequence=["purple"]
)
# 更新布局
fig_residuals.update_layout(
xaxis_title="Residuals",
yaxis_title="Frequency",
plot_bgcolor="white",
font=dict(size=12),
title_font=dict(size=16)
)
fig_residuals.update_xaxes(showgrid=True, gridcolor='lightgray')
fig_residuals.update_yaxes(showgrid=True, gridcolor='lightgray')
# 显示图表
fig_residuals.show()
In [4]:
from statsmodels.tsa.stattools import acf, pacf
# === 计算 ACF 和 PACF ===
lags = 50 # 设置滞后阶数
acf_values = acf(df['Adj_Close'], nlags=lags, fft=True)
pacf_values = pacf(df['Adj_Close'], nlags=lags, method='ywm')
# === 使用 Plotly 绘制 ACF 图 ===
fig_acf = px.bar(
x=list(range(lags + 1)),
y=acf_values,
title="Autocorrelation Function (ACF)",
labels={"x": "Lag", "y": "Autocorrelation"},
color_discrete_sequence=["blue"]
)
fig_acf.update_layout(
xaxis=dict(title="Lag", showgrid=True, gridcolor='lightgray'),
yaxis=dict(title="Autocorrelation", showgrid=True, gridcolor='lightgray'),
plot_bgcolor="white",
font=dict(size=12),
title_font=dict(size=16)
)
# 添加水平线(95%置信区间)
conf_level = 1.96 / np.sqrt(len(df['Adj_Close']))
fig_acf.add_shape(type="line", x0=0, x1=lags, y0=conf_level, y1=conf_level,
line=dict(color="red", dash="dash"))
fig_acf.add_shape(type="line", x0=0, x1=lags, y0=-conf_level, y1=-conf_level,
line=dict(color="red", dash="dash"))
fig_acf.show()
# === 使用 Plotly 绘制 PACF 图 ===
fig_pacf = px.bar(
x=list(range(lags + 1)),
y=pacf_values,
title="Partial Autocorrelation Function (PACF)",
labels={"x": "Lag", "y": "Partial Autocorrelation"},
color_discrete_sequence=["green"]
)
fig_pacf.update_layout(
xaxis=dict(title="Lag", showgrid=True, gridcolor='lightgray'),
yaxis=dict(title="Partial Autocorrelation", showgrid=True, gridcolor='lightgray'),
plot_bgcolor="white",
font=dict(size=12),
title_font=dict(size=16)
)
# 添加水平线(95%置信区间)
fig_pacf.add_shape(type="line", x0=0, x1=lags, y0=conf_level, y1=conf_level,
line=dict(color="red", dash="dash"))
fig_pacf.add_shape(type="line", x0=0, x1=lags, y0=-conf_level, y1=-conf_level,
line=dict(color="red", dash="dash"))
fig_pacf.show()
In [ ]: