In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import time
import warnings
warnings.filterwarnings('ignore')
train_df = pd.read_csv('Fraudulent_E-Commerce_Transaction_Data.csv')
train_df.head()
train_df.shape
train_df.info()
train_df.describe()
# 我们可以看到客户年龄最小值是-16,这是不可能的,所以我们必须修复客户年龄列
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1472952 entries, 0 to 1472951 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Transaction ID 1472952 non-null object 1 Customer ID 1472952 non-null object 2 Transaction Amount 1472952 non-null float64 3 Transaction Date 1472952 non-null object 4 Payment Method 1472952 non-null object 5 Product Category 1472952 non-null object 6 Quantity 1472952 non-null int64 7 Customer Age 1472952 non-null int64 8 Customer Location 1472952 non-null object 9 Device Used 1472952 non-null object 10 IP Address 1472952 non-null object 11 Shipping Address 1472952 non-null object 12 Billing Address 1472952 non-null object 13 Is Fraudulent 1472952 non-null int64 14 Account Age Days 1472952 non-null int64 15 Transaction Hour 1472952 non-null int64 dtypes: float64(1), int64(5), object(10) memory usage: 179.8+ MB
Out[2]:
| Transaction Amount | Quantity | Customer Age | Is Fraudulent | Account Age Days | Transaction Hour | |
|---|---|---|---|---|---|---|
| count | 1.472952e+06 | 1.472952e+06 | 1.472952e+06 | 1.472952e+06 | 1.472952e+06 | 1.472952e+06 |
| mean | 2.267683e+02 | 3.000230e+00 | 3.451208e+01 | 5.012926e-02 | 1.796464e+02 | 1.128696e+01 |
| std | 2.703951e+02 | 1.414736e+00 | 1.000313e+01 | 2.182117e-01 | 1.068642e+02 | 6.975995e+00 |
| min | 1.000000e+01 | 1.000000e+00 | -1.600000e+01 | 0.000000e+00 | 1.000000e+00 | 0.000000e+00 |
| 25% | 6.861000e+01 | 2.000000e+00 | 2.800000e+01 | 0.000000e+00 | 8.600000e+01 | 5.000000e+00 |
| 50% | 1.517600e+02 | 3.000000e+00 | 3.500000e+01 | 0.000000e+00 | 1.790000e+02 | 1.100000e+01 |
| 75% | 2.960500e+02 | 4.000000e+00 | 4.100000e+01 | 0.000000e+00 | 2.720000e+02 | 1.700000e+01 |
| max | 1.270175e+04 | 5.000000e+00 | 8.600000e+01 | 1.000000e+00 | 3.650000e+02 | 2.300000e+01 |
In [3]:
train_df.describe(include='O')
train_df.isnull().sum()
train_df.duplicated().sum()
px.box(data_frame=train_df,x="Customer Age",width=500,height=300)
def clean_data(df) -> pd.DataFrame:
##我们需要将**Transaction Date**列从对象类型转换为日期时间格式。
df["Transaction Date"] = pd.to_datetime(df["Transaction Date"])
## 从交易日期提取日、周、月
df['Transaction Day'] = df["Transaction Date"].dt.day
df["Transaction DOW"] = df["Transaction Date"].dt.day_of_week
df["Transaction Month"] = df["Transaction Date"].dt.month
## 修复客户年龄
mean_value = np.round(df['Customer Age'].mean(),0)
df['Customer Age'] = np.where(df['Customer Age'] <= -9,
np.abs(df['Customer Age']),
df['Customer Age'])
df['Customer Age'] = np.where(df['Customer Age'] < 9,
mean_value,
df['Customer Age'])
## 如果“Shipping Address”与“Billing Address”相同,则取值为1,否则取值为0。
df["Is Address Match"] = (df["Shipping Address"] == df["Billing Address"]).astype(int)
### 删除不相关的特征并向下转换数据类型以减小数据集大小
df.drop(columns=["Transaction ID", "Customer ID", "Customer Location",
"IP Address", "Transaction Date","Shipping Address","Billing Address"], inplace=True)
int_col = df.select_dtypes(include="int").columns
float_col = df.select_dtypes(include="float").columns
df[int_col] = df[int_col].apply(pd.to_numeric, downcast='integer')
df[float_col] = df[float_col].apply(pd.to_numeric, downcast='float')
return df
train_df = clean_data(train_df)
train_df.head()
train_df.info()
# 数据集已被清理和压缩,其大小从180 MB减少到57MB。
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1472952 entries, 0 to 1472951 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Transaction Amount 1472952 non-null float32 1 Payment Method 1472952 non-null object 2 Product Category 1472952 non-null object 3 Quantity 1472952 non-null int8 4 Customer Age 1472952 non-null float32 5 Device Used 1472952 non-null object 6 Is Fraudulent 1472952 non-null int8 7 Account Age Days 1472952 non-null int16 8 Transaction Hour 1472952 non-null int8 9 Transaction Day 1472952 non-null int8 10 Transaction DOW 1472952 non-null int8 11 Transaction Month 1472952 non-null int8 12 Is Address Match 1472952 non-null int8 dtypes: float32(2), int16(1), int8(7), object(3) memory usage: 57.6+ MB
In [4]:
plt.figure(figsize=(10,4))
sns.histplot(train_df["Transaction Amount"],bins=200)
plt.show()
# 交易金额普遍分布在0到1000之间
# 数据是右偏的。
In [5]:
payment_count = train_df["Payment Method"].value_counts()
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
sns.set_palette('pastel')
colors = sns.color_palette()
plt.pie(payment_count,labels = payment_count.index,shadow=True,autopct='%1.1f%%',
colors=colors,
wedgeprops=dict(width=0.8,edgecolor="w"))
plt.title("Payment Method")
plt.subplot(1,2,2)
sns.countplot(data=train_df,x="Payment Method",edgecolor="black",linewidth=1, palette="Set2")
plt.show()
# 我们可以看到所有的付款方式都是平均分配的
In [6]:
category_count = train_df["Product Category"].value_counts()
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
sns.set_palette('pastel')
colors = sns.color_palette()
plt.pie(category_count,labels = category_count.index,shadow=True,autopct='%1.1f%%',
colors=colors,
wedgeprops=dict(width=0.8,edgecolor="w"))
plt.title("Product Category")
plt.subplot(1,2,2)
ax = sns.countplot(data=train_df,x="Product Category",edgecolor="black",linewidth=1, palette="Set2")
# 我们可以看到,所有的产品类别也是均匀分布的
In [7]:
quantity_count = train_df["Quantity"].value_counts()
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
sns.set_palette('pastel')
colors = sns.color_palette()
plt.pie(quantity_count,labels = quantity_count.index,shadow=True,autopct='%1.1f%%',
colors=colors,
wedgeprops=dict(width=0.8,edgecolor="w"))
plt.title("Quantity")
plt.subplot(1,2,2)
ax = sns.countplot(data=train_df,x="Quantity",edgecolor="black",linewidth=1, palette="Set2")
plt.figure(figsize=(7,4))
sns.histplot(data=train_df, x="Customer Age",bins=150,kde=True,color='orange')
plt.show()
device_count = train_df["Device Used"].value_counts()
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
sns.set_palette('pastel')
colors = sns.color_palette()
plt.pie(device_count,labels = device_count.index,shadow=True,autopct='%1.1f%%',
colors=colors,
wedgeprops=dict(width=0.8,edgecolor="w"))
plt.title("Device Used")
plt.subplot(1,2,2)
ax = sns.countplot(data=train_df,x="Device Used",edgecolor="black",linewidth=1, palette="Set2")
hour_count = train_df["Transaction Hour"].value_counts().head(15)
plt.figure(figsize=(10,4))
sns.set_palette('Set2')
colors = sns.color_palette()
ax=sns.barplot(x=hour_count.index, y=hour_count.values,palette=colors)
plt.xticks(rotation=80)
plt.show()
plt.figure(figsize=(5,3))
sns.violinplot(data=train_df, x='Is Fraudulent', y='Transaction Amount')
plt.show()
column = ['Payment Method', 'Product Category',
'Quantity', 'Device Used','Transaction DOW',
'Transaction Month','Is Address Match']
plt.figure(figsize=(10,35))
plot_num = 1
for col in column:
plt.subplot(10,2,plot_num)
sns.countplot(data=train_df, x=col, hue="Is Fraudulent")
plt.xticks(rotation=90)
plt.title(col)
plt.tight_layout()
plot_num += 1
plt.figure(figsize=(10,6))
plt.subplot(1,2,1)
sns.boxenplot(x='Is Fraudulent', y='Transaction Amount', data=train_df)
plt.subplot(1,2,2)
sns.boxenplot(x='Is Fraudulent', y='Transaction Day', data=train_df)
plt.yticks(np.arange(0,32))
plt.show()
In [ ]: