import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings,random,optuna
import plotly.express as px
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score,auc,roc_curve
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
plt.style.use('dark_background')
warnings.simplefilter('ignore', category=FutureWarning)
ds = pd.read_csv('AIDS_Classification.csv')
ds.head()
ds.shape
ds.info()
ds.describe(percentiles=[0, .25, .30, .50, .75, .80, 1]).T.style.background_gradient(cmap = 'inferno')
ds.isnull().sum()
ds.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   time      2139 non-null   int64  
 1   trt       2139 non-null   int64  
 2   age       2139 non-null   int64  
 3   wtkg      2139 non-null   float64
 4   hemo      2139 non-null   int64  
 5   homo      2139 non-null   int64  
 6   drugs     2139 non-null   int64  
 7   karnof    2139 non-null   int64  
 8   oprior    2139 non-null   int64  
 9   z30       2139 non-null   int64  
 10  preanti   2139 non-null   int64  
 11  race      2139 non-null   int64  
 12  gender    2139 non-null   int64  
 13  str2      2139 non-null   int64  
 14  strat     2139 non-null   int64  
 15  symptom   2139 non-null   int64  
 16  treat     2139 non-null   int64  
 17  offtrt    2139 non-null   int64  
 18  cd40      2139 non-null   int64  
 19  cd420     2139 non-null   int64  
 20  cd80      2139 non-null   int64  
 21  cd820     2139 non-null   int64  
 22  infected  2139 non-null   int64  
dtypes: float64(1), int64(22)
memory usage: 384.5 KB

0

ds.describe(percentiles=[0, .25, .30, .50, .75, .80, 1]).T.style.background_gradient(cmap = 'inferno')

import matplotlib.font_manager as fm
# 列出所有可用字体
font_paths = fm.findSystemFonts()
for font_path in font_paths:
    print(fm.FontProperties(fname=font_path).get_name())

Symbol Tiger
Bookman Old Style
Segoe Script
Poor Richard
Arial
Segoe UI
STCaiyun
Calibri
Bookman Old Style
FangSong
Constantia
Rage Italic
Baskerville Old Face
Ebrima
Tw Cen MT Condensed
Trebuchet MS
Microsoft PhagsPa
Symbol
Bauhaus 93
Microsoft YaHei
Microsoft Tai Le
Euclid Math One
Calibri
Century Schoolbook
Dubai
LiSu
Segoe UI
Colonna MT
Tiger Expert
Goudy Old Style
Microsoft JhengHei
STHupo
Script MT Bold
Corbel
Euclid Extra
Segoe UI
Freestyle Script
Bell MT
Dubai
Javanese Text
Microsoft YaHei
Rockwell
Palace Script MT
Times New Roman
Sylfaen
Nirmala UI
Calibri
High Tower Text
Perpetua
Cambria
Century Gothic
Century Schoolbook
Yu Gothic
STZhongsong
Euclid Fraktur
Franklin Gothic Medium
Segoe UI
Mistral
Gigi
Candara
MingLiU-ExtB
MS Outlook
Courier New
Perpetua
Arial
Lucida Console
Microsoft Uighur
Perpetua
Lucida Fax
Euclid Symbol
Bodoni MT
Segoe MDL2 Assets
Elephant
Californian FB
Informal Roman
Bernard MT Condensed
Euclid
Sans Serif Collection
Tw Cen MT Condensed Extra Bold
Franklin Gothic Book
Berlin Sans FB
STXingkai
High Tower Text
Calibri
Lucida Bright
Magneto
Franklin Gothic Book
Lucida Calligraphy
Leelawadee
Euclid Math Two
Corbel
DengXian
Century Schoolbook
Euclid Fraktur
Onyx
Bodoni MT
Rockwell
Castellar
Wingdings
Calisto MT
MT Extra
Goudy Old Style
Kunstler Script
RM Pro
Gill Sans Ultra Bold
Segoe UI
Bodoni MT
Edwardian Script ITC
SimSun
Ravie
Verdana
Euclid
Consolas
Microsoft JhengHei
Lucida Sans Typewriter
Bodoni MT
Verdana
Trebuchet MS
Courier New
Jokerman
Segoe Script
HoloLens MDL2 Assets
Palatino Linotype
Franklin Gothic Demi Cond
Segoe UI
Century
Niagara Solid
Gadugi
Webdings
Arial
Tiger
Copperplate Gothic Light
Lucida Sans Typewriter
Modern No. 20
Cooper Black
Fences
Rockwell
Verdana
Gill Sans MT Ext Condensed Bold
Times New Roman
Arial
Agency FB
Sitka
Book Antiqua
MV Boli
Broadway
MT Extra Tiger
Garamond
ZWAdobeF
Tahoma
Forte
Lucida Bright
Comic Sans MS
Segoe UI Variable
Gabriola
Perpetua Titling MT
Courier New
Arial
Palatino Linotype
Calisto MT
Gill Sans MT
Microsoft New Tai Lue
Heebo
Lucida Bright
Yu Gothic
Segoe UI
Mongolian Baiti
Brush Script MT
Felix Titling
Yu Gothic
Franklin Gothic Medium Cond
MS Reference Sans Serif
DengXian
Georgia
Leelawadee UI
Segoe UI Historic
Arial
FZYaoTi
Symbol Tiger Expert
Imprint MT Shadow
Gill Sans MT
Microsoft Tai Le
Lucida Sans
Century Gothic
Eras Demi ITC
Calisto MT
MS Reference Specialty
Dubai
Bahnschrift
Rockwell Extra Bold
Ink Free
Nirmala UI
Microsoft New Tai Lue
Bookman Old Style
OCR A Extended
Vladimir Script
Goudy Old Style
Trebuchet MS
Gill Sans MT
Segoe UI
IQYHT
Euclid Symbol
Microsoft Yi Baiti
Cera PROModern
Cascadia Code
Calibri
Berlin Sans FB
Myanmar Text
Arial
Franklin Gothic Heavy
Gill Sans MT Condensed
Franklin Gothic Heavy
SimHei
Bodoni MT
Rockwell Condensed
Segoe UI Emoji
Bookman Old Style
Segoe Print
Gill Sans MT
Bradley Hand ITC
Malgun Gothic
Tw Cen MT Condensed
Snap ITC
French Script MT
Book Antiqua
??????
Niagara Engraved
STKaiti
STXinwei
Harlow Solid Italic
ShouShuti
Rockwell
DengXian
MS Gothic
Georgia
Vivaldi
Gill Sans Ultra Bold Condensed
Papyrus
Euclid Math One
Berlin Sans FB Demi
Times New Roman
Microsoft JhengHei
Lucida Fax
Lucida Sans
Lucida Sans Typewriter
Rockwell Condensed
Corbel
STSong
Malgun Gothic
Wingdings 2
Nirmala UI
Century Schoolbook
Kristen ITC
STLiti
Blackadder ITC
Myanmar Text
Arial
Yu Gothic
Times New Roman
Bell MT
Comic Sans MS
Arial Rounded MT Bold
Malgun Gothic
Elephant
Tw Cen MT
Tw Cen MT
Corbel
Candara
Bodoni MT
Segoe UI
Bodoni MT
Californian FB
STXihei
Euclid Symbol
Parchment
Sitka
IQYHT
Consolas
Bell MT
Comic Sans MS
Algerian
Euclid Math Two
Leelawadee UI
Segoe UI
Agency FB
Lucida Bright
Chiller
Centaur
Dubai
Lucida Fax
Segoe Print
Tw Cen MT
Courier New
Haettenschweiler
Bodoni MT
Comic Sans MS
Leelawadee UI
Lucida Bright
Trebuchet MS
Stencil
Lucida Handwriting
Microsoft YaHei
Franklin Gothic Demi
Franklin Gothic Demi
Corbel
Lucida Sans Typewriter
Calibri
KaiTi
Constantia
Showcard Gothic
MF QingShu (Noncommercial)
Lucida Sans Unicode
Impact
Eras Medium ITC
Euclid Extra
Segoe UI
FZShuTi
Corbel
Engravers MT
Calisto MT
Consolas
Bodoni MT
Perpetua Titling MT
Leelawadee
Lucida Sans
Ebrima
Curlz MT
Consolas
Bodoni MT
Candara
Matura MT Script Capitals
Cambria
Tahoma
Constantia
Microsoft Sans Serif
Cambria
Microsoft YaHei
Playbill
Verdana
Eras Light ITC
Segoe Fluent Icons
YouYuan
Gadugi
Century Gothic
SimSun-ExtG
Wingdings 3
CXD6763_Newest
Tempus Sans ITC
Euclid
Garamond
Perpetua
Juice ITC
Microsoft Uighur
Cambria
Bodoni MT
Microsoft PhagsPa
Harrington
Arial
Viner Hand ITC
DejaVu Math TeX Gyre
Footlight MT Light
Eras Bold ITC
Gloucester MT Extra Condensed
Maiandra GD
Constantia
SimSun-ExtB
Book Antiqua
Bookshelf Symbol 7
Palatino Linotype
Goudy Stout
Book Antiqua
Century Gothic
Cascadia Mono
Segoe UI
Euclid Symbol
Candara
Segoe UI Symbol
Californian FB
Candara
Lucida Sans
Copperplate Gothic Bold
Euclid
Georgia
Microsoft Himalaya
STFangsong
Franklin Gothic Medium
Old English Text MT
Georgia
Candara
Britannic Bold
Lucida Fax
Tw Cen MT
Garamond
Monotype Corsiva
Wide Latin
Pristina
Lucida Sans
Palatino Linotype

import matplotlib.pyplot as plt
def mPlotter(r, c, size, _targets, text):
    # 背景颜色
    bg = '#010108'
    # 颜色调色板
    palette = ['#df5337', '#d24644', '#f7d340', '#3339FF', '#440a68', '#84206b', '#f1ef75', '#fbbe23', '#400a67']
    # 字体设置
    font = 'Arial'
    # 创建图形
    fig = plt.figure(figsize=size)
    # 设置背景色
    fig.patch.set_facecolor(bg)
    # 创建网格布局
    grid = fig.add_gridspec(r, c)
    # 更新网格的间距
    grid.update(wspace=0.5, hspace=0.25)
    # 计算空白子图的数量
    __empty_diff = ((r * c) - 1) - len(_targets)
    # 存储所有的子图轴
    axes = []
    # 添加子图
    for i in range(r):
        for j in range(c):
            axes.append(fig.add_subplot(grid[i, j]))
    # 设置每个子图的样式
    for idx, ax in enumerate(axes):
        ax.set_facecolor(bg)
        if idx == 0:
            ax.spines["bottom"].set_visible(False)
            ax.tick_params(left=False, bottom=False)
            ax.set_xticklabels([])
            ax.set_yticklabels([])
            ax.text(0.5, 0.5,
                    f'{text}',
                    horizontalalignment='center',
                    verticalalignment='center',
                    fontsize=18,
                    fontweight='bold',
                    fontfamily=font,
                    color="#fff")
        else:
            if (idx - 1) < len(_targets):
                ax.set_title(_targets[idx - 1].capitalize(), fontsize=14, fontweight='bold', fontfamily=font, color="#fff")
                ax.grid(color='#fff', linestyle=':', axis='y', zorder=0, dashes=(1, 5))
                ax.set_xlabel("")
                ax.set_ylabel("")
            else:
                ax.spines["bottom"].set_visible(False)
                ax.tick_params(left=False, bottom=False)
                ax.set_xticklabels([])
                ax.set_yticklabels([])
                # 隐藏左右上下边框
                ax.spines["left"].set_visible(False)
                ax.spines["top"].set_visible(False)
                ax.spines["right"].set_visible(False)
    # 空白子图的回调函数
    def cb(ax):
        ax.set_xlabel("")
        ax.set_ylabel("")
    # 如果有空白子图，删除它们
    if __empty_diff > 0:
        axes = axes[:-1 * __empty_diff]
    return axes, palette, cb

target = 'infected'
cont_cols = ['time', 'age', 'wtkg', 'preanti', 'cd40', 'cd420', 'cd80', 'cd820']
dis_cols = list(set(ds.columns) - set([*cont_cols, target]))
len(cont_cols), len(dis_cols)

(8, 14)

axes, palette, cb = mPlotter(1, 2, (20, 5), [target], 'Count Of\nInfected Variable\n______________')
sns.countplot(x=ds[target], ax = axes[1], color=palette[0])
cb(axes[1])

axes, palette, cb = mPlotter(3, 3, (20, 20), cont_cols, 'KDE Plot of\nContinuous Variables\n________________')
for col, ax in zip(cont_cols, axes[1:]):
    sns.kdeplot(data=ds, x=col, ax=ax, hue=target, palette=palette[1:3], alpha=.5, linewidth=0, fill=True)
    cb(ax)

axes, palette, cb = mPlotter(3, 3, (20, 20), cont_cols, 'Boxen Plot of\nContinuous Variables\n________________')
for col, ax in zip(cont_cols, axes[1:]):
    sns.boxenplot(data=ds, y=col, ax=ax, palette=[palette[random.randint(0, len(palette)-1)]])
    cb(ax)

axes, palette, cb = mPlotter(5, 3, (20, 20), dis_cols, 'Countplot of\nDiscrete Variables\n________________')
for col, ax in zip(dis_cols, axes[1:]):
    sns.countplot(x=ds[col], ax = ax, hue=ds[target], palette=palette[6:8])
    cb(ax)

ax = px.scatter_3d(ds, x="age", y="wtkg", z="time", template= "plotly_dark", color="infected")
ax.show()

ax = px.scatter_3d(ds, x="preanti", y="cd40", z="cd420", template= "plotly_dark", color="infected")
ax.show()

ax = px.scatter_3d(ds, x="preanti", y="cd80", z="cd820", template= "plotly_dark", color="infected")
ax.show()

fig = plt.figure(figsize=(25, 8))
gs = fig.add_gridspec(1, 1)
gs.update(wspace=0.3, hspace=0.15)
ax = fig.add_subplot(gs[0, 0])
ax.set_title("Correlation Matrix", fontsize=28, fontweight='bold', fontfamily='serif', color="#fff")
sns.heatmap(ds[cont_cols].corr().transpose(), mask=np.triu(np.ones_like(ds[cont_cols].corr().transpose())), fmt=".1f", annot=True, cmap='Blues')
plt.show()

# 拆分数据集为训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(ds.iloc[:,:-1], ds.iloc[:, -1], random_state=3, train_size=.7)
x_train.shape, y_train.shape, x_test.shape, y_test.shape
# 平衡数据集
smote = SMOTE(random_state = 14)
x_train, y_train = smote.fit_resample(x_train, y_train)
x_train.shape, y_train.shape, x_test.shape, y_test.shape
# 数据标准化处理
x_train = MinMaxScaler().fit_transform(x_train)
x_test = MinMaxScaler().fit_transform(x_test)
# 找到catboost的最佳超参数!

def objective(trial):
    params = {
    'iterations': trial.suggest_int('iterations', 100, 1000),
    'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
    'depth': trial.suggest_int('depth', 1, 12),
    'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
    'border_count': trial.suggest_int('border_count', 1, 255),
    'thread_count': -1,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'verbose': False
    }
    model = CatBoostClassifier(**params)
    model.fit(x_train, y_train, eval_set=(x_test, y_test), verbose=False, early_stopping_rounds=20)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50, show_progress_bar=True)

[I 2024-11-17 13:45:42,287] A new study created in memory with name: no-name-9777cd51-18fc-42c5-8234-3c3619597f93

  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-11-17 13:45:44,316] Trial 0 finished with value: 0.8925233644859814 and parameters: {'iterations': 265, 'learning_rate': 0.05570902174985178, 'depth': 11, 'l2_leaf_reg': 0.5509390357470626, 'border_count': 97}. Best is trial 0 with value: 0.8925233644859814.
[I 2024-11-17 13:45:45,403] Trial 1 finished with value: 0.8925233644859814 and parameters: {'iterations': 741, 'learning_rate': 0.11797000147686394, 'depth': 11, 'l2_leaf_reg': 0.12886359808838865, 'border_count': 167}. Best is trial 0 with value: 0.8925233644859814.
[I 2024-11-17 13:45:45,556] Trial 2 finished with value: 0.9049844236760125 and parameters: {'iterations': 243, 'learning_rate': 0.25490821410259124, 'depth': 5, 'l2_leaf_reg': 0.0030390970133002274, 'border_count': 58}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:47,386] Trial 3 finished with value: 0.8940809968847352 and parameters: {'iterations': 929, 'learning_rate': 0.4807205614362448, 'depth': 11, 'l2_leaf_reg': 8.202643510405666, 'border_count': 116}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:47,517] Trial 4 finished with value: 0.8847352024922118 and parameters: {'iterations': 117, 'learning_rate': 0.19832904070472618, 'depth': 1, 'l2_leaf_reg': 0.0807745923638388, 'border_count': 75}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:48,523] Trial 5 finished with value: 0.8894080996884736 and parameters: {'iterations': 352, 'learning_rate': 0.2052416671014467, 'depth': 10, 'l2_leaf_reg': 0.06382738853803058, 'border_count': 189}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:48,823] Trial 6 finished with value: 0.9034267912772586 and parameters: {'iterations': 879, 'learning_rate': 0.42200838147029396, 'depth': 7, 'l2_leaf_reg': 0.6801952787988413, 'border_count': 35}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,284] Trial 7 finished with value: 0.9003115264797508 and parameters: {'iterations': 880, 'learning_rate': 0.060297257902191585, 'depth': 8, 'l2_leaf_reg': 0.009574135079021997, 'border_count': 113}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,422] Trial 8 finished with value: 0.8800623052959502 and parameters: {'iterations': 240, 'learning_rate': 0.02295486811630032, 'depth': 7, 'l2_leaf_reg': 2.9440213224913436, 'border_count': 99}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,534] Trial 9 finished with value: 0.8894080996884736 and parameters: {'iterations': 153, 'learning_rate': 0.054502659870367336, 'depth': 6, 'l2_leaf_reg': 3.7700877500606174, 'border_count': 78}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,634] Trial 10 finished with value: 0.8271028037383178 and parameters: {'iterations': 484, 'learning_rate': 0.011045274294233584, 'depth': 3, 'l2_leaf_reg': 0.0017578417881208433, 'border_count': 236}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,743] Trial 11 finished with value: 0.8862928348909658 and parameters: {'iterations': 665, 'learning_rate': 0.4981157373638572, 'depth': 5, 'l2_leaf_reg': 0.0010996309176048054, 'border_count': 2}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:49,885] Trial 12 finished with value: 0.9003115264797508 and parameters: {'iterations': 466, 'learning_rate': 0.2425926123313047, 'depth': 4, 'l2_leaf_reg': 0.6389011759602823, 'border_count': 15}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:50,167] Trial 13 finished with value: 0.9049844236760125 and parameters: {'iterations': 654, 'learning_rate': 0.30597716266347286, 'depth': 8, 'l2_leaf_reg': 0.009444839542158428, 'border_count': 44}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:50,711] Trial 14 finished with value: 0.9034267912772586 and parameters: {'iterations': 630, 'learning_rate': 0.12305344551037634, 'depth': 9, 'l2_leaf_reg': 0.008163175348480878, 'border_count': 46}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:50,875] Trial 15 finished with value: 0.8925233644859814 and parameters: {'iterations': 400, 'learning_rate': 0.11368556148582416, 'depth': 2, 'l2_leaf_reg': 0.005518433238358547, 'border_count': 50}. Best is trial 2 with value: 0.9049844236760125.
[I 2024-11-17 13:45:51,065] Trial 16 finished with value: 0.9080996884735203 and parameters: {'iterations': 755, 'learning_rate': 0.2871205758879905, 'depth': 5, 'l2_leaf_reg': 0.024780255153663482, 'border_count': 146}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:51,285] Trial 17 finished with value: 0.9018691588785047 and parameters: {'iterations': 773, 'learning_rate': 0.1620882056442661, 'depth': 5, 'l2_leaf_reg': 0.03434367535838834, 'border_count': 148}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:51,432] Trial 18 finished with value: 0.8785046728971962 and parameters: {'iterations': 995, 'learning_rate': 0.0318146432961134, 'depth': 3, 'l2_leaf_reg': 0.0028986722679594707, 'border_count': 216}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:51,573] Trial 19 finished with value: 0.9018691588785047 and parameters: {'iterations': 539, 'learning_rate': 0.27793009238293925, 'depth': 5, 'l2_leaf_reg': 0.02398306722675385, 'border_count': 146}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:51,756] Trial 20 finished with value: 0.8987538940809969 and parameters: {'iterations': 777, 'learning_rate': 0.08871834438850093, 'depth': 4, 'l2_leaf_reg': 0.1921803776641235, 'border_count': 199}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:52,092] Trial 21 finished with value: 0.8940809968847352 and parameters: {'iterations': 608, 'learning_rate': 0.30732889042942796, 'depth': 8, 'l2_leaf_reg': 0.020759893519923734, 'border_count': 66}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:52,241] Trial 22 finished with value: 0.8987538940809969 and parameters: {'iterations': 713, 'learning_rate': 0.3504213439278834, 'depth': 6, 'l2_leaf_reg': 0.003991236494140233, 'border_count': 24}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:52,490] Trial 23 finished with value: 0.8987538940809969 and parameters: {'iterations': 587, 'learning_rate': 0.1607856244538126, 'depth': 8, 'l2_leaf_reg': 0.013506558713518815, 'border_count': 136}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:52,875] Trial 24 finished with value: 0.8987538940809969 and parameters: {'iterations': 836, 'learning_rate': 0.28395858682203007, 'depth': 9, 'l2_leaf_reg': 0.0022443374478459577, 'border_count': 165}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:53,065] Trial 25 finished with value: 0.9018691588785047 and parameters: {'iterations': 498, 'learning_rate': 0.1721321183291424, 'depth': 6, 'l2_leaf_reg': 0.046641043657561415, 'border_count': 59}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:53,172] Trial 26 finished with value: 0.8878504672897196 and parameters: {'iterations': 381, 'learning_rate': 0.36243831749728295, 'depth': 4, 'l2_leaf_reg': 0.016008184681680818, 'border_count': 92}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:53,419] Trial 27 finished with value: 0.8925233644859814 and parameters: {'iterations': 689, 'learning_rate': 0.23585186023439983, 'depth': 7, 'l2_leaf_reg': 0.005356765278727701, 'border_count': 254}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:54,106] Trial 28 finished with value: 0.9049844236760125 and parameters: {'iterations': 552, 'learning_rate': 0.08571205388582874, 'depth': 9, 'l2_leaf_reg': 0.0010182052282238377, 'border_count': 127}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:54,256] Trial 29 finished with value: 0.8878504672897196 and parameters: {'iterations': 298, 'learning_rate': 0.03761557145646596, 'depth': 5, 'l2_leaf_reg': 0.010495525652327342, 'border_count': 85}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:45:58,864] Trial 30 finished with value: 0.8925233644859814 and parameters: {'iterations': 247, 'learning_rate': 0.3580650150013142, 'depth': 12, 'l2_leaf_reg': 0.33457281051457227, 'border_count': 103}. Best is trial 16 with value: 0.9080996884735203.

[I 2024-11-17 13:45:59,227] Trial 31 finished with value: 0.897196261682243 and parameters: {'iterations': 570, 'learning_rate': 0.13855597037035286, 'depth': 9, 'l2_leaf_reg': 0.0011956196319716467, 'border_count': 129}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:01,048] Trial 32 finished with value: 0.9034267912772586 and parameters: {'iterations': 647, 'learning_rate': 0.07860262862838216, 'depth': 10, 'l2_leaf_reg': 0.0034137487806150868, 'border_count': 157}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:02,132] Trial 33 finished with value: 0.9003115264797508 and parameters: {'iterations': 430, 'learning_rate': 0.10275303842928166, 'depth': 10, 'l2_leaf_reg': 0.0018450747243995555, 'border_count': 175}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:02,399] Trial 34 finished with value: 0.9049844236760125 and parameters: {'iterations': 773, 'learning_rate': 0.20607596843787218, 'depth': 8, 'l2_leaf_reg': 0.005586166549679907, 'border_count': 116}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:04,621] Trial 35 finished with value: 0.8800623052959502 and parameters: {'iterations': 527, 'learning_rate': 0.4063746342786786, 'depth': 12, 'l2_leaf_reg': 0.11101414205036296, 'border_count': 31}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:04,823] Trial 36 finished with value: 0.8987538940809969 and parameters: {'iterations': 736, 'learning_rate': 0.24919030318999677, 'depth': 7, 'l2_leaf_reg': 0.02834675204819334, 'border_count': 181}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:05,041] Trial 37 finished with value: 0.9003115264797508 and parameters: {'iterations': 811, 'learning_rate': 0.06307639419772083, 'depth': 3, 'l2_leaf_reg': 0.001019531705512783, 'border_count': 64}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:05,141] Trial 38 finished with value: 0.8255451713395638 and parameters: {'iterations': 321, 'learning_rate': 0.04309724934440145, 'depth': 1, 'l2_leaf_reg': 0.06145103312584726, 'border_count': 198}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:06,306] Trial 39 finished with value: 0.8909657320872274 and parameters: {'iterations': 854, 'learning_rate': 0.010993059720623108, 'depth': 11, 'l2_leaf_reg': 0.007282928333451568, 'border_count': 41}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:06,454] Trial 40 finished with value: 0.8800623052959502 and parameters: {'iterations': 146, 'learning_rate': 0.01684172055937142, 'depth': 6, 'l2_leaf_reg': 0.0026448047901044083, 'border_count': 117}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:06,737] Trial 41 finished with value: 0.9065420560747663 and parameters: {'iterations': 927, 'learning_rate': 0.1930341879621038, 'depth': 8, 'l2_leaf_reg': 0.0054986574065201705, 'border_count': 117}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:07,524] Trial 42 finished with value: 0.8987538940809969 and parameters: {'iterations': 935, 'learning_rate': 0.1848884791066882, 'depth': 9, 'l2_leaf_reg': 0.0017239875926621762, 'border_count': 141}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:07,804] Trial 43 finished with value: 0.9080996884735203 and parameters: {'iterations': 928, 'learning_rate': 0.1384729674448943, 'depth': 7, 'l2_leaf_reg': 0.01337607846469954, 'border_count': 108}. Best is trial 16 with value: 0.9080996884735203.
[I 2024-11-17 13:46:08,066] Trial 44 finished with value: 0.9112149532710281 and parameters: {'iterations': 975, 'learning_rate': 0.14134624637568022, 'depth': 7, 'l2_leaf_reg': 0.01436999730090072, 'border_count': 80}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:08,371] Trial 45 finished with value: 0.9049844236760125 and parameters: {'iterations': 938, 'learning_rate': 0.13846349740355113, 'depth': 7, 'l2_leaf_reg': 0.01477546436823995, 'border_count': 104}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:08,521] Trial 46 finished with value: 0.9034267912772586 and parameters: {'iterations': 970, 'learning_rate': 0.20940646167936078, 'depth': 6, 'l2_leaf_reg': 0.03736577645146407, 'border_count': 76}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:08,721] Trial 47 finished with value: 0.9049844236760125 and parameters: {'iterations': 903, 'learning_rate': 0.13784422117687906, 'depth': 7, 'l2_leaf_reg': 0.019038898090431865, 'border_count': 94}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:08,960] Trial 48 finished with value: 0.9080996884735203 and parameters: {'iterations': 885, 'learning_rate': 0.101061508693785, 'depth': 5, 'l2_leaf_reg': 1.5790529095966501, 'border_count': 108}. Best is trial 44 with value: 0.9112149532710281.
[I 2024-11-17 13:46:09,158] Trial 49 finished with value: 0.9003115264797508 and parameters: {'iterations': 877, 'learning_rate': 0.10076971772135994, 'depth': 4, 'l2_leaf_reg': 1.1644950851988014, 'border_count': 126}. Best is trial 44 with value: 0.9112149532710281.

# 初始化模型并使用前面的最佳超参数
model = CatBoostClassifier(
verbose=0,
random_state=3,
**study.best_params
)
# 训练模型
model.fit(x_train, y_train)
# 预测
y_pred = model.predict(x_test)
# 打印模型评估指标
print('模型准确率：',accuracy_score(y_test,y_pred))
print (classification_report(y_pred, y_test))
plt.subplots(figsize=(20, 6))
sns.heatmap(confusion_matrix(y_pred, y_test), annot = True, fmt="d", cmap="Blues", linewidths=.5)
plt.show()

模型准确率： 0.9018691588785047
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       502
           1       0.76      0.80      0.78       140

    accuracy                           0.90       642
   macro avg       0.85      0.87      0.86       642
weighted avg       0.90      0.90      0.90       642

# 画出ROC曲线
y_prob = model.predict_proba(x_test)[:,1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc = auc(false_positive_rate, true_positive_rate)
plt.title('ROC')
plt.plot(false_positive_rate,true_positive_rate, color='red',label = 'AUC = %0.2f' % roc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],linestyle='--')
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

# 模型预测
res = pd.DataFrame()
res['真实值'] = y_test
res['预测值'] = y_pred
res.sample(10)

	count	mean	std	min	0%	25%	30%	50%	75%	80%	100%	max
time	2139.000000	879.098177	292.274324	14.000000	14.000000	727.000000	825.000000	997.000000	1091.000000	1105.000000	1231.000000	1231.000000
trt	2139.000000	1.520804	1.127890	0.000000	0.000000	1.000000	1.000000	2.000000	3.000000	3.000000	3.000000	3.000000
age	2139.000000	35.248247	8.709026	12.000000	12.000000	29.000000	30.000000	34.000000	40.000000	42.000000	70.000000	70.000000
wtkg	2139.000000	75.125311	13.263164	31.000000	31.000000	66.679200	68.200000	74.390400	82.555200	84.809280	159.939360	159.939360
hemo	2139.000000	0.084151	0.277680	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000
homo	2139.000000	0.661057	0.473461	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000	1.000000
drugs	2139.000000	0.131370	0.337883	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000
karnof	2139.000000	95.446470	5.900985	70.000000	70.000000	90.000000	90.000000	100.000000	100.000000	100.000000	100.000000	100.000000
oprior	2139.000000	0.021973	0.146629	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000
z30	2139.000000	0.550257	0.497584	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000	1.000000
preanti	2139.000000	379.175783	468.657526	0.000000	0.000000	0.000000	0.000000	142.000000	739.500000	842.000000	2851.000000	2851.000000
race	2139.000000	0.288453	0.453149	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000
gender	2139.000000	0.827957	0.377506	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000
str2	2139.000000	0.585788	0.492701	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000	1.000000
strat	2139.000000	1.979897	0.899053	1.000000	1.000000	1.000000	1.000000	2.000000	3.000000	3.000000	3.000000	3.000000
symptom	2139.000000	0.172978	0.378317	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000
treat	2139.000000	0.751286	0.432369	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000
offtrt	2139.000000	0.362786	0.480916	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000
cd40	2139.000000	350.501169	118.573863	0.000000	0.000000	263.500000	280.000000	340.000000	423.000000	444.400000	1199.000000	1199.000000
cd420	2139.000000	371.307153	144.634909	49.000000	49.000000	269.000000	285.000000	353.000000	460.000000	486.000000	1119.000000	1119.000000
cd80	2139.000000	986.627396	480.197750	40.000000	40.000000	654.000000	700.000000	893.000000	1207.000000	1305.000000	5011.000000	5011.000000
cd820	2139.000000	935.369799	444.976051	124.000000	124.000000	631.500000	678.000000	865.000000	1146.500000	1223.000000	6035.000000	6035.000000
infected	2139.000000	0.243572	0.429338	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000

	真实值	预测值
1067	1	1
1878	0	0
1091	0	0
65	0	0
1025	1	1
2113	0	0
951	0	0
745	1	1
1188	1	0
581	0	0

📘 基于Catboost算法的艾滋病数据可视化与建模分析/catboost.ipynb