In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import tensorflow as tf
from keras import Model
from keras.layers import Input, Dense, Dropout, LSTM
from keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')
# === 数据加载与预处理 ===
df_gold = pd.read_csv('Gold Price (2013-2022).csv')
df_gold['Date'] = pd.to_datetime(df_gold['Date'])
df_gold.sort_values(by='Date', ascending=True, inplace=True)
df_gold.reset_index(drop=True, inplace=True)
df_exchange = pd.read_csv('Daily_Gold_Price_on_World.csv')
df_exchange['Date'] = pd.to_datetime(df_exchange['Date'])
df_exchange.sort_values(by='Date', ascending=True, inplace=True)
df_exchange.reset_index(drop=True, inplace=True)
# 将汇率数据限制在黄金价格的日期范围内
df_exchange = df_exchange[df_exchange['Date'] >= df_gold['Date'].min()]
df_exchange = df_exchange[df_exchange['Date'] <= df_gold['Date'].max()]
# 合并黄金价格和汇率数据
df = pd.merge(df_gold, df_exchange, on='Date', how='inner')
# 检查数据是否有缺失值
print(df.isnull().sum())
# 删除任何有缺失值的行(如果有的话)
df.dropna(subset=['Price', 'USD', 'EUR'], inplace=True)
# 确保数据长度一致
print(f"Length of merged dataframe: {len(df)}")
print(f"Length of Date column: {len(df['Date'])}")
# 处理带有单位的列数据 (例如:'0.06K' -> 60, '1M' -> 1000000)
def clean_data(value):
if isinstance(value, str):
# Check for percentage values and convert to decimal
if '%' in value:
return float(value.replace('%', '').strip()) / 100
# Check for 'K', 'M', or 'B' and convert to numerical values
elif 'K' in value:
return float(value.replace('K', '').strip()) * 1000
elif 'M' in value:
return float(value.replace('M', '').strip()) * 1000000
elif 'B' in value:
return float(value.replace('B', '').strip()) * 1000000000
else:
return float(value.replace(',', '').strip()) # Clean up commas for numerical conversion
return value
# 清理所有列数据,应用到每个列
for col in df.columns:
df[col] = df[col].apply(clean_data)
# === 可视化数据 ===
fig = px.line(df, x='Date', y=['Price', 'USD', 'EUR'], title="Gold Price and Exchange Rates")
fig.show()
# === 数据切分 ===
test_size = df[df.Date.dt.year == 2022].shape[0]
scaler = MinMaxScaler()
scaler.fit(df.drop(columns=['Date']).values)
# 对数据进行缩放
train_data = scaler.transform(df.drop(columns=['Date']).iloc[:-test_size].values)
test_data = scaler.transform(df.drop(columns=['Date']).iloc[-test_size - 60:].values)
# 滑动窗口构造
window_size = 60
X_train, y_train, X_test, y_test = [], [], [], []
for i in range(window_size, len(train_data)):
X_train.append(train_data[i - window_size:i, :])
y_train.append(train_data[i, 0]) # 黄金价格作为目标变量
for i in range(window_size, len(test_data)):
X_test.append(test_data[i - window_size:i, :])
y_test.append(test_data[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)
X_test, y_test = np.array(X_test), np.array(y_test)
# 调整形状
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], X_train.shape[2]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], X_test.shape[2]))
y_train = y_train.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))
# === 模型构建 ===
def define_model():
input1 = Input(shape=(window_size, X_train.shape[2])) # X_train.shape[2] 是特征数量
x = LSTM(units=64, return_sequences=True)(input1)
x = Dropout(0.2)(x)
x = LSTM(units=32)(x)
x = Dropout(0.2)(x)
dnn_output = Dense(1)(x) # 输出黄金价格的预测值
model = Model(inputs=input1, outputs=[dnn_output])
model.compile(loss='mean_squared_error', optimizer='Adam') # 使用适合回归的Adam优化器
return model
model = define_model()
# 训练模型
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2,
callbacks=[early_stopping], verbose=1)
# === 模型评估 ===
y_pred = model.predict(X_test)
y_test_true = scaler.inverse_transform(np.hstack((y_test, np.zeros((y_test.shape[0], df.shape[1] - 1)))))
y_pred_rescaled = scaler.inverse_transform(np.hstack((y_pred, np.zeros((y_pred.shape[0], df.shape[1] - 1)))))
plt.figure(figsize=(15, 6))
plt.plot(df['Date'].iloc[:-test_size], scaler.inverse_transform(train_data)[:, 0], color='black', lw=2)
plt.plot(df['Date'].iloc[-test_size:], y_test_true[:, 0], color='blue', lw=2)
plt.plot(df['Date'].iloc[-test_size:], y_pred_rescaled[:, 0], color='red', lw=2)
plt.title('Model Performance on Gold Price Prediction with Exchange Rates', fontsize=15)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Gold Price', fontsize=12)
plt.legend(['Training Data', 'Actual Test Data', 'Predicted Test Data'], loc='upper left', prop={'size': 15})
plt.grid(color='white')
plt.show()
# 评估模型
MAPE = mean_absolute_percentage_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
print(f"MAPE: {MAPE * 100:.2f}%")
print(f"RMSE: {RMSE:.2f}")
MAE = np.mean(np.abs(y_test - y_pred))
print(f"MAE: {MAE:.2f}")
from sklearn.metrics import r2_score
R2 = r2_score(y_test, y_pred)
print(f"R² Score: {R2:.2f}")
Date 0 Price 0 Open 0 High 0 Low 0 Vol. 1 Change % 0 USD 0 EUR 0 JPY 0 GBP 0 CAD 0 CHF 0 INR 0 Chinese renmimbi (CNY) 0 Turkish lira (TRY) 0 Saudi riyal (SAR) 0 Indonesian rupiah (IDR) 0 UAE dirham (AED) 0 Thai baht THB) 0 Vietnamese dong (VND) 0 Egyptian pound (EGP) 0 Korean won (KRW) 0 Russian ruble (RUB) 0 South African rand (ZAR) 0 Australian dollar (AUD) 0 dtype: int64 Length of merged dataframe: 2393 Length of Date column: 2393
Epoch 1/100 57/57 [==============================] - 4s 33ms/step - loss: nan - val_loss: nan Epoch 2/100 57/57 [==============================] - 1s 25ms/step - loss: nan - val_loss: nan Epoch 3/100 57/57 [==============================] - 1s 24ms/step - loss: nan - val_loss: nan Epoch 4/100 57/57 [==============================] - 1s 24ms/step - loss: nan - val_loss: nan Epoch 5/100 57/57 [==============================] - 2s 28ms/step - loss: nan - val_loss: nan Epoch 6/100 57/57 [==============================] - 2s 27ms/step - loss: nan - val_loss: nan Epoch 7/100 57/57 [==============================] - 2s 27ms/step - loss: nan - val_loss: nan Epoch 8/100 57/57 [==============================] - 2s 27ms/step - loss: nan - val_loss: nan Epoch 9/100 57/57 [==============================] - 2s 27ms/step - loss: nan - val_loss: nan Epoch 10/100 57/57 [==============================] - 2s 27ms/step - loss: nan - val_loss: nan 3/3 [==============================] - 1s 5ms/step
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[11], line 123 120 # === 模型评估 === 121 y_pred = model.predict(X_test) --> 123 y_test_true = scaler.inverse_transform(np.hstack((y_test, np.zeros((y_test.shape[0], df.shape[1] - 1))))) 124 y_pred_rescaled = scaler.inverse_transform(np.hstack((y_pred, np.zeros((y_pred.shape[0], df.shape[1] - 1))))) 126 # Plotting the results File d:\Python310\lib\site-packages\sklearn\preprocessing\_data.py:548, in MinMaxScaler.inverse_transform(self, X) 542 check_is_fitted(self) 544 X = check_array( 545 X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" 546 ) --> 548 X -= self.min_ 549 X /= self.scale_ 550 return X ValueError: operands could not be broadcast together with shapes (70,26) (25,) (70,26)