import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score, explained_variance_score
from keras import Model
from keras.layers import Input, Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping
import warnings

warnings.filterwarnings('ignore')

# === Data collection and preprocessing ===
df = pd.read_csv('Gold_Data.csv')

# 1. Cleaning: columns that are not needed (for example the trading volume)
#    could be dropped at this point; currently every column is kept.

# 2. Parse the date column and put the rows in chronological order with a
#    fresh 0..n-1 index.
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by='Date', ascending=True).reset_index(drop=True)

# 3. Remove the "," characters from every non-date column and cast the
#    result to float64.
NumCols = df.columns.drop(['Date'])
df[NumCols] = (
    df[NumCols]
    .replace({',': ''}, regex=True)
    .astype('float64')
)

# 4. Report whether any cell in the frame is missing.
print("数据中存在缺失值！请检查。" if df.isnull().values.any() else "数据无缺失值。")

# === Volatility analysis ===
# Descriptive statistics of the adjusted close price. The mean and standard
# deviation are computed once and reused for the coefficient of variation
# (standard deviation divided by mean).
adj_close = df['Adj_Close']
adj_close_mean = adj_close.mean()
adj_close_std = adj_close.std()
print(f"黄金价格数据的均值:\n{adj_close_mean}")
print(f"黄金价格数据的标准差:\n{adj_close_std}")
print(f"黄金价格数据的变异系数:\n{adj_close_std / adj_close_mean}")

# Autocorrelation (ACF) and partial autocorrelation (PACF) plots, 50 lags each.
# NOTE(review): the titles below contain CJK characters; presumably a
# CJK-capable matplotlib font is configured where this runs - confirm.
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(adj_close, lags=50)
plt.title('自相关函数 (ACF) 分析')
plt.show()

plot_pacf(adj_close, lags=50)
plt.title('偏自相关函数 (PACF) 分析')
plt.show()

# === Gold price history chart ===
# The series plotted here is the raw adjusted close (no scaling has been
# applied yet), so the y axis is labelled "Price" rather than "Scaled Price".
fig = px.line(y=adj_close, x=df['Date'], title="Gold Price History Data")
fig.update_traces(line_color='black')
fig.update_layout(plot_bgcolor='rgba(255,223,0,0.8)', xaxis_title="Date", yaxis_title="Price")
fig.show()

# === Preprocessing: min-max scaling and sliding-window samples ===
# Number of consecutive past observations that form one model input.
# Defined first so the test slice below uses the same value instead of a
# separately hard-coded literal.
window_size = 60

# The test set is sized by the number of rows dated 2022 and taken from the
# end of the (date-sorted) frame.
# NOTE(review): this assumes the 2022 rows are the last rows of the data,
# i.e. that the file contains nothing after 2022 - confirm.
test_size = df[df['Date'].dt.year == 2022].shape[0]
if test_size == 0:
    # Without this guard the [:-0] slice below would silently yield an
    # empty training set and fail later with an unrelated-looking error.
    raise ValueError("No rows dated 2022 were found; cannot build the test set.")

prices = df['Adj_Close'].values.reshape(-1, 1)
train_prices = prices[:-test_size]
if len(train_prices) <= window_size:
    raise ValueError(
        f"Need more than {window_size} training rows to build windows, "
        f"got {len(train_prices)}."
    )

# Fit the scaler on the training period only, so that the minimum/maximum of
# the test period does not leak into the features the model is trained on.
# Test values outside the training range simply map outside [0, 1].
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_prices)
# The test slice is extended backwards by one window so that the first test
# target already has a full window of history in front of it.
test_data = scaler.transform(prices[-test_size - window_size:])


def _make_windows(series, window):
    """Turn a scaled (n, 1) series into supervised samples.

    Sample ``k`` uses rows ``k .. k + window - 1`` as input and row
    ``k + window`` as target.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape ``(n - window, window, 1)``
        (samples, time steps, features) and ``y`` has shape
        ``(n - window, 1)``.
    """
    samples = np.array(
        [series[end - window:end, 0] for end in range(window, len(series))]
    )
    targets = series[window:, 0]
    return samples.reshape((-1, window, 1)), targets.reshape((-1, 1))


X_train, y_train = _make_windows(train_data, window_size)
X_test, y_test = _make_windows(test_data, window_size)

# === 基于LSTM的黄金价格预测 ===
def define_model():
    """Build and compile the stacked-LSTM regression network.

    The input shape is ``(window_size, 1)``, with ``window_size`` read from
    module scope. The stack is LSTM(64, returning sequences) -> Dropout(0.2)
    -> LSTM(32) -> Dropout(0.2) -> Dense(1); the final Dense layer is given
    no activation argument.

    Returns:
        The model, compiled with mean-squared-error loss and the Adam
        optimizer.
    """
    sequence_input = Input(shape=(window_size, 1))

    # Layers are instantiated in the order in which they are applied.
    stack = (
        LSTM(units=64, return_sequences=True),
        Dropout(0.2),
        LSTM(units=32),  # fewer units in the second LSTM to limit overfitting
        Dropout(0.2),
        Dense(1),  # single regression output
    )
    tensor = sequence_input
    for layer in stack:
        tensor = layer(tensor)

    network = Model(inputs=sequence_input, outputs=[tensor])
    network.compile(loss='mean_squared_error', optimizer='Adam')
    return network

model = define_model()

# Train for at most 100 epochs. Training stops once val_loss has failed to
# improve for 10 consecutive epochs, and the weights of the best epoch are
# restored. 20% of the training samples are held out for validation.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=100,
    verbose=1,
    callbacks=[early_stopping],
)

# === Visualise the results ===
# Predict on the test windows, then map both the predictions and the true
# targets back from the scaled range to prices.
y_pred = model.predict(X_test)
y_test_pred = scaler.inverse_transform(y_pred)
y_test_true = scaler.inverse_transform(y_test)

# Training series, actual test series and predicted test series on one axis.
plt.figure(figsize=(15, 6))
plt.plot(
    df['Date'].iloc[:-test_size],
    scaler.inverse_transform(train_data),
    color='black',
    lw=2,
    label='Training Data',
)
plt.plot(
    df['Date'].iloc[-test_size:],
    y_test_true,
    color='blue',
    lw=2,
    label='Actual Test Data',
)
plt.plot(
    df['Date'].iloc[-test_size:],
    y_test_pred,
    color='red',
    lw=2,
    label='Predicted Test Data',
)
plt.title('Model Performance on Gold Price Prediction', fontsize=15)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Price', fontsize=12)
plt.legend(loc='upper left', prop={'size': 15})
plt.grid(color='white')
plt.show()

# === Evaluation ===
# Every metric is computed on the inverse-transformed arrays (y_test_true /
# y_test_pred), i.e. in price units. On the min-max-scaled arrays RMSE and
# MAE would be unitless fractions of the fitted range, and MAPE / SMAPE
# would depend on the scaler's offset because their denominators are the
# scaled values themselves.
abs_error = np.abs(y_test_true - y_test_pred)

MAPE = mean_absolute_percentage_error(y_test_true, y_test_pred)
MSE = mean_squared_error(y_test_true, y_test_pred)
RMSE = np.sqrt(MSE)
MAE = np.mean(abs_error)
R2 = r2_score(y_test_true, y_test_pred)
# Symmetric MAPE: absolute error relative to the mean magnitude of the true
# and predicted value, expressed in percent.
SMAPE = np.mean(2 * abs_error / (np.abs(y_test_true) + np.abs(y_test_pred))) * 100
EVS = explained_variance_score(y_test_true, y_test_pred)

print(f"MAPE: {MAPE * 100:.2f}%")
print(f"RMSE: {RMSE:.2f}")
print(f"MAE: {MAE:.2f}")
print(f"R² Score: {R2:.2f}")
print(f"SMAPE: {SMAPE:.2f}%")
print(f"Explained Variance Score: {EVS:.2f}")

# Residual distribution (true minus predicted), in price units to match the
# metrics above.
residuals = y_test_true - y_test_pred
plt.figure(figsize=(10, 5))
plt.hist(residuals.ravel(), bins=50, color='purple', alpha=0.7)
plt.title('Residual Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()

数据无缺失值。
黄金价格数据的均值:
1150.7248180549143
黄金价格数据的标准差:
569.1439853180088
黄金价格数据的变异系数:
0.4945960809987922

Epoch 1/100
144/144 [==============================] - 18s 76ms/step - loss: 0.0046 - val_loss: 3.7204e-04
Epoch 2/100
144/144 [==============================] - 5s 37ms/step - loss: 0.0013 - val_loss: 0.0012
Epoch 3/100
144/144 [==============================] - 5s 37ms/step - loss: 0.0011 - val_loss: 3.1841e-04
Epoch 4/100
144/144 [==============================] - 5s 37ms/step - loss: 0.0010 - val_loss: 4.8822e-04
Epoch 5/100
144/144 [==============================] - 5s 37ms/step - loss: 8.8024e-04 - val_loss: 2.0068e-04
Epoch 6/100
144/144 [==============================] - 6s 41ms/step - loss: 8.4688e-04 - val_loss: 2.4858e-04
Epoch 7/100
144/144 [==============================] - 5s 37ms/step - loss: 7.6652e-04 - val_loss: 2.4751e-04
Epoch 8/100
144/144 [==============================] - 5s 35ms/step - loss: 7.3120e-04 - val_loss: 4.3774e-04
Epoch 9/100
144/144 [==============================] - 5s 37ms/step - loss: 6.6445e-04 - val_loss: 1.8103e-04
Epoch 10/100
144/144 [==============================] - 5s 36ms/step - loss: 5.8101e-04 - val_loss: 2.1313e-04
Epoch 11/100
144/144 [==============================] - 5s 35ms/step - loss: 5.5550e-04 - val_loss: 2.4173e-04
Epoch 12/100
144/144 [==============================] - 12s 82ms/step - loss: 5.2842e-04 - val_loss: 1.6630e-04
Epoch 13/100
144/144 [==============================] - 12s 84ms/step - loss: 4.7654e-04 - val_loss: 2.7373e-04
Epoch 14/100
144/144 [==============================] - 11s 76ms/step - loss: 4.7484e-04 - val_loss: 4.3786e-04
Epoch 15/100
144/144 [==============================] - 5s 34ms/step - loss: 4.5052e-04 - val_loss: 1.7527e-04
Epoch 16/100
144/144 [==============================] - 5s 34ms/step - loss: 4.2153e-04 - val_loss: 6.3881e-04
Epoch 17/100
144/144 [==============================] - 5s 34ms/step - loss: 4.1771e-04 - val_loss: 1.9830e-04
Epoch 18/100
144/144 [==============================] - 5s 33ms/step - loss: 4.2623e-04 - val_loss: 5.7228e-04
Epoch 19/100
144/144 [==============================] - 5s 33ms/step - loss: 4.1016e-04 - val_loss: 1.9803e-04
Epoch 20/100
144/144 [==============================] - 5s 34ms/step - loss: 3.7692e-04 - val_loss: 2.8257e-04
Epoch 21/100
144/144 [==============================] - 5s 34ms/step - loss: 4.1918e-04 - val_loss: 2.9835e-04
Epoch 22/100
144/144 [==============================] - 5s 35ms/step - loss: 3.7815e-04 - val_loss: 3.3123e-04
8/8 [==============================] - 1s 12ms/step

MAPE: 1.89%
RMSE: 0.02
MAE: 0.02
R² Score: 0.95
SMAPE: 1.91%
Explained Variance Score: 0.97

# === Plotly chart: training data and test predictions ===
# Date ranges matching the train / test split.
train_dates = df['Date'].iloc[:-test_size]
test_dates = df['Date'].iloc[-test_size:]

# Start from an empty titled figure and add one line trace per series:
# the training prices, the actual test prices and the predicted test prices.
fig = px.line(title="Model Performance on Gold Price Prediction")
fig.add_scatter(
    x=train_dates,
    y=scaler.inverse_transform(train_data).ravel(),
    name='Training Data',
    mode='lines',
    line={'color': 'black', 'width': 2},
)
fig.add_scatter(
    x=test_dates,
    y=y_test_true.ravel(),
    name='Actual Test Data',
    mode='lines',
    line={'color': 'blue', 'width': 2},
)
fig.add_scatter(
    x=test_dates,
    y=y_test_pred.ravel(),
    name='Predicted Test Data',
    mode='lines',
    line={'color': 'red', 'width': 2},
)

# Axis titles, white background, font sizes and light-gray grid lines.
fig.update_layout(
    title_font={'size': 16},
    font={'size': 12},
    legend={'title': "Legend", 'font': {'size': 12}},
    plot_bgcolor="white",
    xaxis_title="Date",
    yaxis_title="Price",
)
fig.update_xaxes(gridcolor='lightgray', showgrid=True)
fig.update_yaxes(gridcolor='lightgray', showgrid=True)

# Render the chart.
fig.show()

# === Plotly chart: residual distribution ===
# Residuals are taken on the inverse-transformed arrays (true minus
# predicted price), so the x axis is in the same price units as the
# performance chart rather than in min-max-scaled units.
residuals = y_test_true - y_test_pred
fig_residuals = px.histogram(
    residuals.ravel(),
    nbins=50,
    title="Residual Distribution",
    labels={"value": "Residuals", "count": "Frequency"},
    color_discrete_sequence=["purple"],
)

# Axis titles, white background, font sizes and light-gray grid lines.
fig_residuals.update_layout(
    xaxis_title="Residuals",
    yaxis_title="Frequency",
    plot_bgcolor="white",
    font=dict(size=12),
    title_font=dict(size=16),
)
fig_residuals.update_xaxes(showgrid=True, gridcolor='lightgray')
fig_residuals.update_yaxes(showgrid=True, gridcolor='lightgray')

# Render the chart.
fig_residuals.show()

from statsmodels.tsa.stattools import acf, pacf

# === Compute ACF and PACF ===
lags = 50  # highest lag to evaluate
acf_values = acf(df['Adj_Close'], nlags=lags, fft=True)
pacf_values = pacf(df['Adj_Close'], nlags=lags, method='ywm')

# Half-width of the band drawn on both charts (marked as the 95% confidence
# interval): 1.96 / sqrt(number of observations).
conf_level = 1.96 / np.sqrt(len(df['Adj_Close']))


def _correlogram_figure(values, title, y_label, bar_color, max_lag, band):
    """Build a Plotly bar chart of correlation values for lags 0..max_lag.

    Two dashed red horizontal lines are drawn at ``+band`` and ``-band``.
    The figure is returned without being shown.
    """
    chart = px.bar(
        x=list(range(max_lag + 1)),
        y=values,
        title=title,
        labels={"x": "Lag", "y": y_label},
        color_discrete_sequence=[bar_color],
    )

    grid_style = dict(showgrid=True, gridcolor='lightgray')
    chart.update_layout(
        xaxis=dict(title="Lag", **grid_style),
        yaxis=dict(title=y_label, **grid_style),
        plot_bgcolor="white",
        font=dict(size=12),
        title_font=dict(size=16),
    )

    # Upper bound first, then lower bound.
    for level in (band, -band):
        chart.add_shape(type="line", x0=0, x1=max_lag, y0=level, y1=level,
                        line=dict(color="red", dash="dash"))
    return chart


# === ACF chart ===
fig_acf = _correlogram_figure(
    acf_values,
    "Autocorrelation Function (ACF)",
    "Autocorrelation",
    "blue",
    lags,
    conf_level,
)
fig_acf.show()

# === PACF chart ===
fig_pacf = _correlogram_figure(
    pacf_values,
    "Partial Autocorrelation Function (PACF)",
    "Partial Autocorrelation",
    "green",
    lags,
    conf_level,
)
fig_pacf.show()

📘 基于长短期记忆网络（LSTM）的黄金价格预测模型/Gold_Data.ipynb