import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import time
import warnings
warnings.filterwarnings('ignore')
train_df = pd.read_csv('Fraudulent_E-Commerce_Transaction_Data.csv')
train_df.head()
train_df.shape
train_df.info()
train_df.describe()
# 我们可以看到客户年龄最小值是-16，这是不可能的，所以我们必须修复客户年龄列

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472952 entries, 0 to 1472951
Data columns (total 16 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Transaction ID      1472952 non-null  object 
 1   Customer ID         1472952 non-null  object 
 2   Transaction Amount  1472952 non-null  float64
 3   Transaction Date    1472952 non-null  object 
 4   Payment Method      1472952 non-null  object 
 5   Product Category    1472952 non-null  object 
 6   Quantity            1472952 non-null  int64  
 7   Customer Age        1472952 non-null  int64  
 8   Customer Location   1472952 non-null  object 
 9   Device Used         1472952 non-null  object 
 10  IP Address          1472952 non-null  object 
 11  Shipping Address    1472952 non-null  object 
 12  Billing Address     1472952 non-null  object 
 13  Is Fraudulent       1472952 non-null  int64  
 14  Account Age Days    1472952 non-null  int64  
 15  Transaction Hour    1472952 non-null  int64  
dtypes: float64(1), int64(5), object(10)
memory usage: 179.8+ MB

train_df.describe(include='O')
train_df.isnull().sum()
train_df.duplicated().sum()
px.box(data_frame=train_df,x="Customer Age",width=500,height=300)
def clean_data(df) -> pd.DataFrame:
    ##我们需要将**Transaction Date**列从对象类型转换为日期时间格式。
    df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])
    ## 从交易日期提取日、周、月
    df['Transaction Day'] = df["Transaction Date"].dt.day
    df["Transaction DOW"] = df["Transaction Date"].dt.day_of_week
    df["Transaction Month"] = df["Transaction Date"].dt.month
    ## 修复客户年龄
    mean_value = np.round(df['Customer Age'].mean(),0)
    df['Customer Age'] = np.where(df['Customer Age'] <= -9,
    np.abs(df['Customer Age']),
    df['Customer Age'])
    df['Customer Age'] = np.where(df['Customer Age'] < 9,
    mean_value,
    df['Customer Age'])
    ## 如果“Shipping Address”与“Billing Address”相同，则取值为1，否则取值为0。
    df["Is Address Match"] = (df["Shipping Address"] == df["Billing Address"]).astype(int)
    ### 删除不相关的特征并向下转换数据类型以减小数据集大小
    df.drop(columns=["Transaction ID", "Customer ID", "Customer Location",
    "IP Address", "Transaction Date","Shipping Address","Billing Address"], inplace=True)
    int_col = df.select_dtypes(include="int").columns
    float_col = df.select_dtypes(include="float").columns
    df[int_col] = df[int_col].apply(pd.to_numeric, downcast='integer')
    df[float_col] = df[float_col].apply(pd.to_numeric, downcast='float')
    return df
train_df = clean_data(train_df)
train_df.head()
train_df.info()
# 数据集已被清理和压缩，其大小从180 MB减少到57MB。

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472952 entries, 0 to 1472951
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Transaction Amount  1472952 non-null  float32
 1   Payment Method      1472952 non-null  object 
 2   Product Category    1472952 non-null  object 
 3   Quantity            1472952 non-null  int8   
 4   Customer Age        1472952 non-null  float32
 5   Device Used         1472952 non-null  object 
 6   Is Fraudulent       1472952 non-null  int8   
 7   Account Age Days    1472952 non-null  int16  
 8   Transaction Hour    1472952 non-null  int8   
 9   Transaction Day     1472952 non-null  int8   
 10  Transaction DOW     1472952 non-null  int8   
 11  Transaction Month   1472952 non-null  int8   
 12  Is Address Match    1472952 non-null  int8   
dtypes: float32(2), int16(1), int8(7), object(3)
memory usage: 57.6+ MB

plt.figure(figsize=(10,4))
sns.histplot(train_df["Transaction Amount"],bins=200)
plt.show()
# 交易金额普遍分布在0到1000之间
# 数据是右偏的。

payment_count = train_df["Payment Method"].value_counts()
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
sns.set_palette('pastel')
colors = sns.color_palette()
plt.pie(payment_count,labels = payment_count.index,shadow=True,autopct='%1.1f%%',
colors=colors,
wedgeprops=dict(width=0.8,edgecolor="w"))
plt.title("Payment Method")
plt.subplot(1,2,2)
sns.countplot(data=train_df,x="Payment Method",edgecolor="black",linewidth=1, palette="Set2")
plt.show()
# 我们可以看到所有的付款方式都是平均分配的

category_count = train_df["Product Category"].value_counts()
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
sns.set_palette('pastel')
colors = sns.color_palette()
plt.pie(category_count,labels = category_count.index,shadow=True,autopct='%1.1f%%',
colors=colors,
wedgeprops=dict(width=0.8,edgecolor="w"))
plt.title("Product Category")
plt.subplot(1,2,2)
ax = sns.countplot(data=train_df,x="Product Category",edgecolor="black",linewidth=1, palette="Set2")
# 我们可以看到，所有的产品类别也是均匀分布的

quantity_count = train_df["Quantity"].value_counts()
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
sns.set_palette('pastel')
colors = sns.color_palette()
plt.pie(quantity_count,labels = quantity_count.index,shadow=True,autopct='%1.1f%%',
colors=colors,
wedgeprops=dict(width=0.8,edgecolor="w"))
plt.title("Quantity")
plt.subplot(1,2,2)
ax = sns.countplot(data=train_df,x="Quantity",edgecolor="black",linewidth=1, palette="Set2")
plt.figure(figsize=(7,4))
sns.histplot(data=train_df, x="Customer Age",bins=150,kde=True,color='orange')
plt.show()
device_count = train_df["Device Used"].value_counts()
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
sns.set_palette('pastel')
colors = sns.color_palette()
plt.pie(device_count,labels = device_count.index,shadow=True,autopct='%1.1f%%',
colors=colors,
wedgeprops=dict(width=0.8,edgecolor="w"))
plt.title("Device Used")
plt.subplot(1,2,2)
ax = sns.countplot(data=train_df,x="Device Used",edgecolor="black",linewidth=1, palette="Set2")
hour_count = train_df["Transaction Hour"].value_counts().head(15)
plt.figure(figsize=(10,4))
sns.set_palette('Set2')
colors = sns.color_palette()
ax=sns.barplot(x=hour_count.index, y=hour_count.values,palette=colors)
plt.xticks(rotation=80)
plt.show()
plt.figure(figsize=(5,3))
sns.violinplot(data=train_df, x='Is Fraudulent', y='Transaction Amount')
plt.show()
column = ['Payment Method', 'Product Category',
'Quantity', 'Device Used','Transaction DOW',
'Transaction Month','Is Address Match']
plt.figure(figsize=(10,35))
plot_num = 1
for col in column:
    plt.subplot(10,2,plot_num)
    sns.countplot(data=train_df, x=col, hue="Is Fraudulent")
    plt.xticks(rotation=90)
    plt.title(col)
    plt.tight_layout()
    plot_num += 1
plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
sns.boxenplot(x='Is Fraudulent', y='Transaction Amount', data=train_df)
plt.subplot(1,2,2)
sns.boxenplot(x='Is Fraudulent', y='Transaction Day', data=train_df)
plt.yticks(np.arange(0,32))
plt.show()

	Transaction Amount	Quantity	Customer Age	Is Fraudulent	Account Age Days	Transaction Hour
count	1.472952e+06	1.472952e+06	1.472952e+06	1.472952e+06	1.472952e+06	1.472952e+06
mean	2.267683e+02	3.000230e+00	3.451208e+01	5.012926e-02	1.796464e+02	1.128696e+01
std	2.703951e+02	1.414736e+00	1.000313e+01	2.182117e-01	1.068642e+02	6.975995e+00
min	1.000000e+01	1.000000e+00	-1.600000e+01	0.000000e+00	1.000000e+00	0.000000e+00
25%	6.861000e+01	2.000000e+00	2.800000e+01	0.000000e+00	8.600000e+01	5.000000e+00
50%	1.517600e+02	3.000000e+00	3.500000e+01	0.000000e+00	1.790000e+02	1.100000e+01
75%	2.960500e+02	4.000000e+00	4.100000e+01	0.000000e+00	2.720000e+02	1.700000e+01
max	1.270175e+04	5.000000e+00	8.600000e+01	1.000000e+00	3.650000e+02	2.300000e+01

📘 欺诈性电子商务交易数据集可视化分析/01.ipynb