In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings,random,optuna
import plotly.express as px
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,auc,roc_curve
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
plt.style.use('dark_background')
warnings.simplefilter('ignore', category=FutureWarning)
ds = pd.read_csv('AIDS_Classification.csv')
ds.head()
ds.shape
ds.info()
ds.describe(percentiles=[0, .25, .30, .50, .75, .80, 1]).T.style.background_gradient(cmap = 'inferno')
ds.isnull().sum()
ds.duplicated().sum()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2139 entries, 0 to 2138 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 time 2139 non-null int64 1 trt 2139 non-null int64 2 age 2139 non-null int64 3 wtkg 2139 non-null float64 4 hemo 2139 non-null int64 5 homo 2139 non-null int64 6 drugs 2139 non-null int64 7 karnof 2139 non-null int64 8 oprior 2139 non-null int64 9 z30 2139 non-null int64 10 preanti 2139 non-null int64 11 race 2139 non-null int64 12 gender 2139 non-null int64 13 str2 2139 non-null int64 14 strat 2139 non-null int64 15 symptom 2139 non-null int64 16 treat 2139 non-null int64 17 offtrt 2139 non-null int64 18 cd40 2139 non-null int64 19 cd420 2139 non-null int64 20 cd80 2139 non-null int64 21 cd820 2139 non-null int64 22 infected 2139 non-null int64 dtypes: float64(1), int64(22) memory usage: 384.5 KB
Out[4]:
0
In [5]:
ds.describe(percentiles=[0, .25, .30, .50, .75, .80, 1]).T.style.background_gradient(cmap = 'inferno')
Out[5]:
| count | mean | std | min | 0% | 25% | 30% | 50% | 75% | 80% | 100% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| time | 2139.000000 | 879.098177 | 292.274324 | 14.000000 | 14.000000 | 727.000000 | 825.000000 | 997.000000 | 1091.000000 | 1105.000000 | 1231.000000 | 1231.000000 |
| trt | 2139.000000 | 1.520804 | 1.127890 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 |
| age | 2139.000000 | 35.248247 | 8.709026 | 12.000000 | 12.000000 | 29.000000 | 30.000000 | 34.000000 | 40.000000 | 42.000000 | 70.000000 | 70.000000 |
| wtkg | 2139.000000 | 75.125311 | 13.263164 | 31.000000 | 31.000000 | 66.679200 | 68.200000 | 74.390400 | 82.555200 | 84.809280 | 159.939360 | 159.939360 |
| hemo | 2139.000000 | 0.084151 | 0.277680 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| homo | 2139.000000 | 0.661057 | 0.473461 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| drugs | 2139.000000 | 0.131370 | 0.337883 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| karnof | 2139.000000 | 95.446470 | 5.900985 | 70.000000 | 70.000000 | 90.000000 | 90.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 | 100.000000 |
| oprior | 2139.000000 | 0.021973 | 0.146629 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| z30 | 2139.000000 | 0.550257 | 0.497584 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| preanti | 2139.000000 | 379.175783 | 468.657526 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 142.000000 | 739.500000 | 842.000000 | 2851.000000 | 2851.000000 |
| race | 2139.000000 | 0.288453 | 0.453149 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| gender | 2139.000000 | 0.827957 | 0.377506 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| str2 | 2139.000000 | 0.585788 | 0.492701 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| strat | 2139.000000 | 1.979897 | 0.899053 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 3.000000 | 3.000000 | 3.000000 | 3.000000 |
| symptom | 2139.000000 | 0.172978 | 0.378317 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
| treat | 2139.000000 | 0.751286 | 0.432369 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| offtrt | 2139.000000 | 0.362786 | 0.480916 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| cd40 | 2139.000000 | 350.501169 | 118.573863 | 0.000000 | 0.000000 | 263.500000 | 280.000000 | 340.000000 | 423.000000 | 444.400000 | 1199.000000 | 1199.000000 |
| cd420 | 2139.000000 | 371.307153 | 144.634909 | 49.000000 | 49.000000 | 269.000000 | 285.000000 | 353.000000 | 460.000000 | 486.000000 | 1119.000000 | 1119.000000 |
| cd80 | 2139.000000 | 986.627396 | 480.197750 | 40.000000 | 40.000000 | 654.000000 | 700.000000 | 893.000000 | 1207.000000 | 1305.000000 | 5011.000000 | 5011.000000 |
| cd820 | 2139.000000 | 935.369799 | 444.976051 | 124.000000 | 124.000000 | 631.500000 | 678.000000 | 865.000000 | 1146.500000 | 1223.000000 | 6035.000000 | 6035.000000 |
| infected | 2139.000000 | 0.243572 | 0.429338 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
In [13]:
import matplotlib.font_manager as fm
# 列出所有可用字体
font_paths = fm.findSystemFonts()
for font_path in font_paths:
print(fm.FontProperties(fname=font_path).get_name())
Symbol Tiger Bookman Old Style Segoe Script Poor Richard Arial Segoe UI STCaiyun Calibri Bookman Old Style FangSong Constantia Rage Italic Baskerville Old Face Ebrima Tw Cen MT Condensed Trebuchet MS Microsoft PhagsPa Symbol Bauhaus 93 Microsoft YaHei Microsoft Tai Le Euclid Math One Calibri Century Schoolbook Dubai LiSu Segoe UI Colonna MT Tiger Expert Goudy Old Style Microsoft JhengHei STHupo Script MT Bold Corbel Euclid Extra Segoe UI Freestyle Script Bell MT Dubai Javanese Text Microsoft YaHei Rockwell Palace Script MT Times New Roman Sylfaen Nirmala UI Calibri High Tower Text Perpetua Cambria Century Gothic Century Schoolbook Yu Gothic STZhongsong Euclid Fraktur Franklin Gothic Medium Segoe UI Mistral Gigi Candara MingLiU-ExtB MS Outlook Courier New Perpetua Arial Lucida Console Microsoft Uighur Perpetua Lucida Fax Euclid Symbol Bodoni MT Segoe MDL2 Assets Elephant Californian FB Informal Roman Bernard MT Condensed Euclid Sans Serif Collection Tw Cen MT Condensed Extra Bold Franklin Gothic Book Berlin Sans FB STXingkai High Tower Text Calibri Lucida Bright Magneto Franklin Gothic Book Lucida Calligraphy Leelawadee Euclid Math Two Corbel DengXian Century Schoolbook Euclid Fraktur Onyx Bodoni MT Rockwell Castellar Wingdings Calisto MT MT Extra Goudy Old Style Kunstler Script RM Pro Gill Sans Ultra Bold Segoe UI Bodoni MT Edwardian Script ITC SimSun Ravie Verdana Euclid Consolas Microsoft JhengHei Lucida Sans Typewriter Bodoni MT Verdana Trebuchet MS Courier New Jokerman Segoe Script HoloLens MDL2 Assets Palatino Linotype Franklin Gothic Demi Cond Segoe UI Century Niagara Solid Gadugi Webdings Arial Tiger Copperplate Gothic Light Lucida Sans Typewriter Modern No. 20 Cooper Black Fences Rockwell Verdana Gill Sans MT Ext Condensed Bold Times New Roman Arial Agency FB Sitka Book Antiqua MV Boli Broadway MT Extra Tiger Garamond ZWAdobeF Tahoma Forte Lucida Bright Comic Sans MS Segoe UI Variable Gabriola Perpetua Titling MT Courier New Arial Palatino Linotype Calisto MT Gill Sans MT Microsoft New Tai Lue Heebo Lucida Bright Yu Gothic Segoe UI Mongolian Baiti Brush Script MT Felix Titling Yu Gothic Franklin Gothic Medium Cond MS Reference Sans Serif DengXian Georgia Leelawadee UI Segoe UI Historic Arial FZYaoTi Symbol Tiger Expert Imprint MT Shadow Gill Sans MT Microsoft Tai Le Lucida Sans Century Gothic Eras Demi ITC Calisto MT MS Reference Specialty Dubai Bahnschrift Rockwell Extra Bold Ink Free Nirmala UI Microsoft New Tai Lue Bookman Old Style OCR A Extended Vladimir Script Goudy Old Style Trebuchet MS Gill Sans MT Segoe UI IQYHT Euclid Symbol Microsoft Yi Baiti Cera PROModern Cascadia Code Calibri Berlin Sans FB Myanmar Text Arial Franklin Gothic Heavy Gill Sans MT Condensed Franklin Gothic Heavy SimHei Bodoni MT Rockwell Condensed Segoe UI Emoji Bookman Old Style Segoe Print Gill Sans MT Bradley Hand ITC Malgun Gothic Tw Cen MT Condensed Snap ITC French Script MT Book Antiqua ?????? Niagara Engraved STKaiti STXinwei Harlow Solid Italic ShouShuti Rockwell DengXian MS Gothic Georgia Vivaldi Gill Sans Ultra Bold Condensed Papyrus Euclid Math One Berlin Sans FB Demi Times New Roman Microsoft JhengHei Lucida Fax Lucida Sans Lucida Sans Typewriter Rockwell Condensed Corbel STSong Malgun Gothic Wingdings 2 Nirmala UI Century Schoolbook Kristen ITC STLiti Blackadder ITC Myanmar Text Arial Yu Gothic Times New Roman Bell MT Comic Sans MS Arial Rounded MT Bold Malgun Gothic Elephant Tw Cen MT Tw Cen MT Corbel Candara Bodoni MT Segoe UI Bodoni MT Californian FB STXihei Euclid Symbol Parchment Sitka IQYHT Consolas Bell MT Comic Sans MS Algerian Euclid Math Two Leelawadee UI Segoe UI Agency FB Lucida Bright Chiller Centaur Dubai Lucida Fax Segoe Print Tw Cen MT Courier New Haettenschweiler Bodoni MT Comic Sans MS Leelawadee UI Lucida Bright Trebuchet MS Stencil Lucida Handwriting Microsoft YaHei Franklin Gothic Demi Franklin Gothic Demi Corbel Lucida Sans Typewriter Calibri KaiTi Constantia Showcard Gothic MF QingShu (Noncommercial) Lucida Sans Unicode Impact Eras Medium ITC Euclid Extra Segoe UI FZShuTi Corbel Engravers MT Calisto MT Consolas Bodoni MT Perpetua Titling MT Leelawadee Lucida Sans Ebrima Curlz MT Consolas Bodoni MT Candara Matura MT Script Capitals Cambria Tahoma Constantia Microsoft Sans Serif Cambria Microsoft YaHei Playbill Verdana Eras Light ITC Segoe Fluent Icons YouYuan Gadugi Century Gothic SimSun-ExtG Wingdings 3 CXD6763_Newest Tempus Sans ITC Euclid Garamond Perpetua Juice ITC Microsoft Uighur Cambria Bodoni MT Microsoft PhagsPa Harrington Arial Viner Hand ITC DejaVu Math TeX Gyre Footlight MT Light Eras Bold ITC Gloucester MT Extra Condensed Maiandra GD Constantia SimSun-ExtB Book Antiqua Bookshelf Symbol 7 Palatino Linotype Goudy Stout Book Antiqua Century Gothic Cascadia Mono Segoe UI Euclid Symbol Candara Segoe UI Symbol Californian FB Candara Lucida Sans Copperplate Gothic Bold Euclid Georgia Microsoft Himalaya STFangsong Franklin Gothic Medium Old English Text MT Georgia Candara Britannic Bold Lucida Fax Tw Cen MT Garamond Monotype Corsiva Wide Latin Pristina Lucida Sans Palatino Linotype
In [14]:
import matplotlib.pyplot as plt
def mPlotter(r, c, size, _targets, text):
# 背景颜色
bg = '#010108'
# 颜色调色板
palette = ['#df5337', '#d24644', '#f7d340', '#3339FF', '#440a68', '#84206b', '#f1ef75', '#fbbe23', '#400a67']
# 字体设置
font = 'Arial'
# 创建图形
fig = plt.figure(figsize=size)
# 设置背景色
fig.patch.set_facecolor(bg)
# 创建网格布局
grid = fig.add_gridspec(r, c)
# 更新网格的间距
grid.update(wspace=0.5, hspace=0.25)
# 计算空白子图的数量
__empty_diff = ((r * c) - 1) - len(_targets)
# 存储所有的子图轴
axes = []
# 添加子图
for i in range(r):
for j in range(c):
axes.append(fig.add_subplot(grid[i, j]))
# 设置每个子图的样式
for idx, ax in enumerate(axes):
ax.set_facecolor(bg)
if idx == 0:
ax.spines["bottom"].set_visible(False)
ax.tick_params(left=False, bottom=False)
ax.set_xticklabels([])
ax.set_yticklabels([])
ax.text(0.5, 0.5,
f'{text}',
horizontalalignment='center',
verticalalignment='center',
fontsize=18,
fontweight='bold',
fontfamily=font,
color="#fff")
else:
if (idx - 1) < len(_targets):
ax.set_title(_targets[idx - 1].capitalize(), fontsize=14, fontweight='bold', fontfamily=font, color="#fff")
ax.grid(color='#fff', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
ax.set_xlabel("")
ax.set_ylabel("")
else:
ax.spines["bottom"].set_visible(False)
ax.tick_params(left=False, bottom=False)
ax.set_xticklabels([])
ax.set_yticklabels([])
# 隐藏左右上下边框
ax.spines["left"].set_visible(False)
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
# 空白子图的回调函数
def cb(ax):
ax.set_xlabel("")
ax.set_ylabel("")
# 如果有空白子图,删除它们
if __empty_diff > 0:
axes = axes[:-1 * __empty_diff]
return axes, palette, cb
In [15]:
target = 'infected'
cont_cols = ['time', 'age', 'wtkg', 'preanti', 'cd40', 'cd420', 'cd80', 'cd820']
dis_cols = list(set(ds.columns) - set([*cont_cols, target]))
len(cont_cols), len(dis_cols)
Out[15]:
(8, 14)
In [16]:
axes, palette, cb = mPlotter(1, 2, (20, 5), [target], 'Count Of\nInfected Variable\n______________')
sns.countplot(x=ds[target], ax = axes[1], color=palette[0])
cb(axes[1])
In [17]:
axes, palette, cb = mPlotter(3, 3, (20, 20), cont_cols, 'KDE Plot of\nContinuous Variables\n________________')
for col, ax in zip(cont_cols, axes[1:]):
sns.kdeplot(data=ds, x=col, ax=ax, hue=target, palette=palette[1:3], alpha=.5, linewidth=0, fill=True)
cb(ax)
In [18]:
axes, palette, cb = mPlotter(3, 3, (20, 20), cont_cols, 'Boxen Plot of\nContinuous Variables\n________________')
for col, ax in zip(cont_cols, axes[1:]):
sns.boxenplot(data=ds, y=col, ax=ax, palette=[palette[random.randint(0, len(palette)-1)]])
cb(ax)
In [19]:
axes, palette, cb = mPlotter(5, 3, (20, 20), dis_cols, 'Countplot of\nDiscrete Variables\n________________')
for col, ax in zip(dis_cols, axes[1:]):
sns.countplot(x=ds[col], ax = ax, hue=ds[target], palette=palette[6:8])
cb(ax)
In [20]:
ax = px.scatter_3d(ds, x="age", y="wtkg", z="time", template= "plotly_dark", color="infected")
ax.show()
In [21]:
ax = px.scatter_3d(ds, x="preanti", y="cd40", z="cd420", template= "plotly_dark", color="infected")
ax.show()
In [22]:
ax = px.scatter_3d(ds, x="preanti", y="cd80", z="cd820", template= "plotly_dark", color="infected")
ax.show()
In [23]:
fig = plt.figure(figsize=(25, 8))
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0.3, hspace=0.15)
ax = fig.add_subplot(gs[0, 0])
ax.set_title("Correlation Matrix", fontsize=28, fontweight='bold', fontfamily='serif', color="#fff")
sns.heatmap(ds[cont_cols].corr().transpose(), mask=np.triu(np.ones_like(ds[cont_cols].corr().transpose())), fmt=".1f", annot=True, cmap='Blues')
plt.show()
In [24]:
# 拆分数据集为训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(ds.iloc[:,:-1], ds.iloc[:, -1], random_state=3, train_size=.7)
x_train.shape, y_train.shape, x_test.shape, y_test.shape
# 平衡数据集
smote = SMOTE(random_state = 14)
x_train, y_train = smote.fit_resample(x_train, y_train)
x_train.shape, y_train.shape, x_test.shape, y_test.shape
# 数据标准化处理
x_train = MinMaxScaler().fit_transform(x_train)
x_test = MinMaxScaler().fit_transform(x_test)
# 找到catboost的最佳超参数!
In [25]:
def objective(trial):
params = {
'iterations': trial.suggest_int('iterations', 100, 1000),
'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
'depth': trial.suggest_int('depth', 1, 12),
'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
'border_count': trial.suggest_int('border_count', 1, 255),
'thread_count': -1,
'loss_function': 'MultiClass',
'eval_metric': 'Accuracy',
'verbose': False
}
model = CatBoostClassifier(**params)
model.fit(x_train, y_train, eval_set=(x_test, y_test), verbose=False, early_stopping_rounds=20)
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
return accuracy
In [26]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)
[I 2024-11-17 13:45:42,287] A new study created in memory with name: no-name-9777cd51-18fc-42c5-8234-3c3619597f93
0%| | 0/50 [00:00<?, ?it/s]
[I 2024-11-17 13:45:44,316] Trial 0 finished with value: 0.8925233644859814 and parameters: {'iterations': 265, 'learning_rate': 0.05570902174985178, 'depth': 11, 'l2_leaf_reg': 0.5509390357470626, 'border_count': 97}. Best is trial 0 with value: 0.8925233644859814.
[I 2024-11-17 13:45:45,403] Trial 1 finished with value: 0.8925233644859814 and parameters: {'iterations': 741, 'learning_rate': 0.11797000147686394, 'depth': 11, 'l2_leaf_reg': 0.12886359808838865, 'border_count': 167}. Best is trial 0 with value: 0.8925233644859814.
[I 2024-11-17 13:45:45,556] Trial 2 finished with value: 0.9049844236760125 and parameters: {'iterations': 243, 'learning_rate': 0.25490821410259124, 'depth': 5, 'l2_leaf_reg': 0.0030390970133002274, 'border_count': 58}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:47,386] Trial 3 finished with value: 0.8940809968847352 and parameters: {'iterations': 929, 'learning_rate': 0.4807205614362448, 'depth': 11, 'l2_leaf_reg': 8.202643510405666, 'border_count': 116}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:47,517] Trial 4 finished with value: 0.8847352024922118 and parameters: {'iterations': 117, 'learning_rate': 0.19832904070472618, 'depth': 1, 'l2_leaf_reg': 0.0807745923638388, 'border_count': 75}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:48,523] Trial 5 finished with value: 0.8894080996884736 and parameters: {'iterations': 352, 'learning_rate': 0.2052416671014467, 'depth': 10, 'l2_leaf_reg': 0.06382738853803058, 'border_count': 189}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:48,823] Trial 6 finished with value: 0.9034267912772586 and parameters: {'iterations': 879, 'learning_rate': 0.42200838147029396, 'depth': 7, 'l2_leaf_reg': 0.6801952787988413, 'border_count': 35}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,284] Trial 7 finished with value: 0.9003115264797508 and parameters: {'iterations': 880, 'learning_rate': 0.060297257902191585, 'depth': 8, 'l2_leaf_reg': 0.009574135079021997, 'border_count': 113}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,422] Trial 8 finished with value: 0.8800623052959502 and parameters: {'iterations': 240, 'learning_rate': 0.02295486811630032, 'depth': 7, 'l2_leaf_reg': 2.9440213224913436, 'border_count': 99}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,534] Trial 9 finished with value: 0.8894080996884736 and parameters: {'iterations': 153, 'learning_rate': 0.054502659870367336, 'depth': 6, 'l2_leaf_reg': 3.7700877500606174, 'border_count': 78}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,634] Trial 10 finished with value: 0.8271028037383178 and parameters: {'iterations': 484, 'learning_rate': 0.011045274294233584, 'depth': 3, 'l2_leaf_reg': 0.0017578417881208433, 'border_count': 236}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,743] Trial 11 finished with value: 0.8862928348909658 and parameters: {'iterations': 665, 'learning_rate': 0.4981157373638572, 'depth': 5, 'l2_leaf_reg': 0.0010996309176048054, 'border_count': 2}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,885] Trial 12 finished with value: 0.9003115264797508 and parameters: {'iterations': 466, 'learning_rate': 0.2425926123313047, 'depth': 4, 'l2_leaf_reg': 0.6389011759602823, 'border_count': 15}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:50,167] Trial 13 finished with value: 0.9049844236760125 and parameters: {'iterations': 654, 'learning_rate': 0.30597716266347286, 'depth': 8, 'l2_leaf_reg': 0.009444839542158428, 'border_count': 44}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:50,711] Trial 14 finished with value: 0.9034267912772586 and parameters: {'iterations': 630, 'learning_rate': 0.12305344551037634, 'depth': 9, 'l2_leaf_reg': 0.008163175348480878, 'border_count': 46}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:50,875] Trial 15 finished with value: 0.8925233644859814 and parameters: {'iterations': 400, 'learning_rate': 0.11368556148582416, 'depth': 2, 'l2_leaf_reg': 0.005518433238358547, 'border_count': 50}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:51,065] Trial 16 finished with value: 0.9080996884735203 and parameters: {'iterations': 755, 'learning_rate': 0.2871205758879905, 'depth': 5, 'l2_leaf_reg': 0.024780255153663482, 'border_count': 146}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:51,285] Trial 17 finished with value: 0.9018691588785047 and parameters: {'iterations': 773, 'learning_rate': 0.1620882056442661, 'depth': 5, 'l2_leaf_reg': 0.03434367535838834, 'border_count': 148}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:51,432] Trial 18 finished with value: 0.8785046728971962 and parameters: {'iterations': 995, 'learning_rate': 0.0318146432961134, 'depth': 3, 'l2_leaf_reg': 0.0028986722679594707, 'border_count': 216}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:51,573] Trial 19 finished with value: 0.9018691588785047 and parameters: {'iterations': 539, 'learning_rate': 0.27793009238293925, 'depth': 5, 'l2_leaf_reg': 0.02398306722675385, 'border_count': 146}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:51,756] Trial 20 finished with value: 0.8987538940809969 and parameters: {'iterations': 777, 'learning_rate': 0.08871834438850093, 'depth': 4, 'l2_leaf_reg': 0.1921803776641235, 'border_count': 199}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:52,092] Trial 21 finished with value: 0.8940809968847352 and parameters: {'iterations': 608, 'learning_rate': 0.30732889042942796, 'depth': 8, 'l2_leaf_reg': 0.020759893519923734, 'border_count': 66}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:52,241] Trial 22 finished with value: 0.8987538940809969 and parameters: {'iterations': 713, 'learning_rate': 0.3504213439278834, 'depth': 6, 'l2_leaf_reg': 0.003991236494140233, 'border_count': 24}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:52,490] Trial 23 finished with value: 0.8987538940809969 and parameters: {'iterations': 587, 'learning_rate': 0.1607856244538126, 'depth': 8, 'l2_leaf_reg': 0.013506558713518815, 'border_count': 136}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:52,875] Trial 24 finished with value: 0.8987538940809969 and parameters: {'iterations': 836, 'learning_rate': 0.28395858682203007, 'depth': 9, 'l2_leaf_reg': 0.0022443374478459577, 'border_count': 165}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:53,065] Trial 25 finished with value: 0.9018691588785047 and parameters: {'iterations': 498, 'learning_rate': 0.1721321183291424, 'depth': 6, 'l2_leaf_reg': 0.046641043657561415, 'border_count': 59}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:53,172] Trial 26 finished with value: 0.8878504672897196 and parameters: {'iterations': 381, 'learning_rate': 0.36243831749728295, 'depth': 4, 'l2_leaf_reg': 0.016008184681680818, 'border_count': 92}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:53,419] Trial 27 finished with value: 0.8925233644859814 and parameters: {'iterations': 689, 'learning_rate': 0.23585186023439983, 'depth': 7, 'l2_leaf_reg': 0.005356765278727701, 'border_count': 254}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:54,106] Trial 28 finished with value: 0.9049844236760125 and parameters: {'iterations': 552, 'learning_rate': 0.08571205388582874, 'depth': 9, 'l2_leaf_reg': 0.0010182052282238377, 'border_count': 127}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:54,256] Trial 29 finished with value: 0.8878504672897196 and parameters: {'iterations': 298, 'learning_rate': 0.03761557145646596, 'depth': 5, 'l2_leaf_reg': 0.010495525652327342, 'border_count': 85}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:58,864] Trial 30 finished with value: 0.8925233644859814 and parameters: {'iterations': 247, 'learning_rate': 0.3580650150013142, 'depth': 12, 'l2_leaf_reg': 0.33457281051457227, 'border_count': 103}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:59,227] Trial 31 finished with value: 0.897196261682243 and parameters: {'iterations': 570, 'learning_rate': 0.13855597037035286, 'depth': 9, 'l2_leaf_reg': 0.0011956196319716467, 'border_count': 129}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:01,048] Trial 32 finished with value: 0.9034267912772586 and parameters: {'iterations': 647, 'learning_rate': 0.07860262862838216, 'depth': 10, 'l2_leaf_reg': 0.0034137487806150868, 'border_count': 157}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:02,132] Trial 33 finished with value: 0.9003115264797508 and parameters: {'iterations': 430, 'learning_rate': 0.10275303842928166, 'depth': 10, 'l2_leaf_reg': 0.0018450747243995555, 'border_count': 175}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:02,399] Trial 34 finished with value: 0.9049844236760125 and parameters: {'iterations': 773, 'learning_rate': 0.20607596843787218, 'depth': 8, 'l2_leaf_reg': 0.005586166549679907, 'border_count': 116}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:04,621] Trial 35 finished with value: 0.8800623052959502 and parameters: {'iterations': 527, 'learning_rate': 0.4063746342786786, 'depth': 12, 'l2_leaf_reg': 0.11101414205036296, 'border_count': 31}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:04,823] Trial 36 finished with value: 0.8987538940809969 and parameters: {'iterations': 736, 'learning_rate': 0.24919030318999677, 'depth': 7, 'l2_leaf_reg': 0.02834675204819334, 'border_count': 181}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:05,041] Trial 37 finished with value: 0.9003115264797508 and parameters: {'iterations': 811, 'learning_rate': 0.06307639419772083, 'depth': 3, 'l2_leaf_reg': 0.001019531705512783, 'border_count': 64}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:05,141] Trial 38 finished with value: 0.8255451713395638 and parameters: {'iterations': 321, 'learning_rate': 0.04309724934440145, 'depth': 1, 'l2_leaf_reg': 0.06145103312584726, 'border_count': 198}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:06,306] Trial 39 finished with value: 0.8909657320872274 and parameters: {'iterations': 854, 'learning_rate': 0.010993059720623108, 'depth': 11, 'l2_leaf_reg': 0.007282928333451568, 'border_count': 41}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:06,454] Trial 40 finished with value: 0.8800623052959502 and parameters: {'iterations': 146, 'learning_rate': 0.01684172055937142, 'depth': 6, 'l2_leaf_reg': 0.0026448047901044083, 'border_count': 117}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:06,737] Trial 41 finished with value: 0.9065420560747663 and parameters: {'iterations': 927, 'learning_rate': 0.1930341879621038, 'depth': 8, 'l2_leaf_reg': 0.0054986574065201705, 'border_count': 117}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:07,524] Trial 42 finished with value: 0.8987538940809969 and parameters: {'iterations': 935, 'learning_rate': 0.1848884791066882, 'depth': 9, 'l2_leaf_reg': 0.0017239875926621762, 'border_count': 141}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:07,804] Trial 43 finished with value: 0.9080996884735203 and parameters: {'iterations': 928, 'learning_rate': 0.1384729674448943, 'depth': 7, 'l2_leaf_reg': 0.01337607846469954, 'border_count': 108}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:08,066] Trial 44 finished with value: 0.9112149532710281 and parameters: {'iterations': 975, 'learning_rate': 0.14134624637568022, 'depth': 7, 'l2_leaf_reg': 0.01436999730090072, 'border_count': 80}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:08,371] Trial 45 finished with value: 0.9049844236760125 and parameters: {'iterations': 938, 'learning_rate': 0.13846349740355113, 'depth': 7, 'l2_leaf_reg': 0.01477546436823995, 'border_count': 104}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:08,521] Trial 46 finished with value: 0.9034267912772586 and parameters: {'iterations': 970, 'learning_rate': 0.20940646167936078, 'depth': 6, 'l2_leaf_reg': 0.03736577645146407, 'border_count': 76}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:08,721] Trial 47 finished with value: 0.9049844236760125 and parameters: {'iterations': 903, 'learning_rate': 0.13784422117687906, 'depth': 7, 'l2_leaf_reg': 0.019038898090431865, 'border_count': 94}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:08,960] Trial 48 finished with value: 0.9080996884735203 and parameters: {'iterations': 885, 'learning_rate': 0.101061508693785, 'depth': 5, 'l2_leaf_reg': 1.5790529095966501, 'border_count': 108}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:09,158] Trial 49 finished with value: 0.9003115264797508 and parameters: {'iterations': 877, 'learning_rate': 0.10076971772135994, 'depth': 4, 'l2_leaf_reg': 1.1644950851988014, 'border_count': 126}. Best is trial 44 with value: 0.9112149532710281.
In [27]:
# 初始化模型并使用前面的最佳超参数
model = CatBoostClassifier(
verbose=0,
random_state=3,
**study.best_params
)
# 训练模型
model.fit(x_train, y_train)
# 预测
y_pred = model.predict(x_test)
# 打印模型评估指标
print('模型准确率:',accuracy_score(y_test,y_pred))
print (classification_report(y_pred, y_test))
plt.subplots(figsize=(20, 6))
sns.heatmap(confusion_matrix(y_pred, y_test), annot = True, fmt="d", cmap="Blues", linewidths=.5)
plt.show()
模型准确率: 0.9018691588785047
precision recall f1-score support
0 0.94 0.93 0.94 502
1 0.76 0.80 0.78 140
accuracy 0.90 642
macro avg 0.85 0.87 0.86 642
weighted avg 0.90 0.90 0.90 642
In [28]:
# 画出ROC曲线
y_prob = model.predict_proba(x_test)[:,1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc = auc(false_positive_rate, true_positive_rate)
plt.title('ROC')
plt.plot(false_positive_rate,true_positive_rate, color='red',label = 'AUC = %0.2f' % roc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],linestyle='--')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [29]:
# 模型预测
res = pd.DataFrame()
res['真实值'] = y_test
res['预测值'] = y_pred
res.sample(10)
Out[29]:
| 真实值 | 预测值 | |
|---|---|---|
| 1067 | 1 | 1 |
| 1878 | 0 | 0 |
| 1091 | 0 | 0 |
| 65 | 0 | 0 |
| 1025 | 1 | 1 |
| 2113 | 0 | 0 |
| 951 | 0 | 0 |
| 745 | 1 | 1 |
| 1188 | 1 | 0 |
| 581 | 0 | 0 |
In [ ]: