In [2]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0] # 只保留正数量的记录
# 计算RFM值
snapshot_date = data['InvoiceDate'].max() + pd.Timedelta(days=1)
rfm = data.groupby('CustomerID').agg({
'InvoiceDate': lambda x: (snapshot_date - x.max()).days, # Recency
'InvoiceNo': 'nunique', # Frequency
'Quantity': 'sum' # Monetary
})
# 重命名列
rfm.rename(columns={'InvoiceDate': 'Recency', 'InvoiceNo': 'Frequency', 'Quantity': 'Monetary'}, inplace=True)
# K-means聚类
kmeans = KMeans(n_clusters=8, random_state=42) # 假设聚为4类
rfm['Cluster'] = kmeans.fit_predict(rfm[['Recency', 'Frequency', 'Monetary']])
# 可视化聚类结果
plt.figure(figsize=(10, 6))
sns.scatterplot(data=rfm, x='Recency', y='Monetary', hue='Cluster', palette='viridis')
plt.title('RFM Clustering')
plt.xlabel('Recency (days)')
plt.ylabel('Monetary')
plt.legend(title='Cluster')
plt.show()
# 显示聚类中心
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['Recency', 'Frequency', 'Monetary'])
print("Cluster Centers:\n", centers)
# 将结果保存为CSV文件
# rfm.to_csv('rfm_kmeans_clustered.csv')
D:\Python310\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning super()._check_params_vs_input(X, default_n_init=10)
Cluster Centers:
Recency Frequency Monetary
0 26.352941 15.052941 4990.829412
1 5.000000 44.666667 59744.666667
2 109.514447 2.402741 346.787906
3 8.533333 61.400000 29782.733333
4 2.000000 74.000000 197491.000000
5 72.000000 54.200000 76678.000000
6 22.843750 23.625000 12524.562500
7 37.272244 7.464807 1835.606906
In [5]:
# 定义客户类型
customer_types = {
0: "中频高值客户",
1: "高频高值客户",
2: "低频低值客户",
3: "中频高值客户",
4: "极高频高值客户",
5: "高频中值客户",
6: "中频低值客户",
7: "低频低值客户"
}
# 统计每个聚类的客户数量
cluster_counts = rfm['Cluster'].value_counts()
# 输出客户数量和类型
for cluster, count in cluster_counts.items():
print(f"Cluster {cluster} ({customer_types[cluster]}) 客户数量: {count}")
# 获取每个聚类的客户ID
# for i in range(8):
# customer_ids = rfm[rfm['Cluster'] == i].index.tolist()
# print(f"Cluster {i} ({customer_types[i]}) 客户ID:{customer_ids}")
Cluster 2 (低频低值客户) 客户数量: 3351 Cluster 7 (低频低值客户) 客户数量: 757 Cluster 0 (中频高值客户) 客户数量: 172 Cluster 6 (中频低值客户) 客户数量: 32 Cluster 3 (中频高值客户) 客户数量: 15 Cluster 1 (高频高值客户) 客户数量: 6 Cluster 5 (高频中值客户) 客户数量: 5 Cluster 4 (极高频高值客户) 客户数量: 1
销售趋势分析¶
In [6]:
import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0] # 只保留正数量的记录
# 按月汇总销售额
data['Sales'] = data['Quantity'] * data['UnitPrice']
monthly_sales = data.resample('M', on='InvoiceDate').sum()['Sales']
# 可视化销售趋势
plt.figure(figsize=(12, 6))
monthly_sales.plot()
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales Amount')
plt.xticks(rotation=45)
plt.grid()
plt.show()
C:\Users\黄清枫\AppData\Local\Temp\ipykernel_18284\1519722528.py:13: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
monthly_sales = data.resample('M', on='InvoiceDate').sum()['Sales']
产品分析¶
In [9]:
import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0] # 只保留正数量的记录
# 计算每个产品的销售额
data['Sales'] = data['Quantity'] * data['UnitPrice']
# 计算每个产品的总销售额和销售数量
product_sales = data.groupby('Description').agg({'Quantity': 'sum', 'Sales': 'sum'}).reset_index()
# 按销售数量排序
top_products = product_sales.sort_values(by='Quantity', ascending=False).head(10)
bottom_products = product_sales.sort_values(by='Quantity').head(10)
# 可视化热销产品
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(top_products['Description'], top_products['Quantity'], color='green')
plt.title('Top 10 Best-Selling Products')
plt.xlabel('Sales Quantity')
# 可视化滞销产品
plt.subplot(1, 2, 2)
plt.barh(bottom_products['Description'], bottom_products['Quantity'], color='red')
plt.title('Top 10 Worst-Selling Products')
plt.xlabel('Sales Quantity')
plt.tight_layout()
plt.show()
客户分析¶
In [8]:
import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0] # 只保留正数量的记录
# 计算RFM值
snapshot_date = data['InvoiceDate'].max() + pd.Timedelta(days=1)
rfm = data.groupby('CustomerID').agg({
'InvoiceDate': lambda x: (snapshot_date - x.max()).days, # Recency
'InvoiceNo': 'nunique', # Frequency
'Quantity': 'sum' # Monetary
}).reset_index()
rfm.rename(columns={'InvoiceDate': 'Recency', 'InvoiceNo': 'Frequency', 'Quantity': 'Monetary'}, inplace=True)
# 可视化RFM值分布
plt.figure(figsize=(12, 6))
plt.subplot(1, 3, 1)
plt.hist(rfm['Recency'], bins=30, color='blue', alpha=0.7)
plt.title('Recency Distribution')
plt.xlabel('Recency (days)')
plt.ylabel('Frequency')
plt.subplot(1, 3, 2)
plt.hist(rfm['Frequency'], bins=30, color='orange', alpha=0.7)
plt.title('Frequency Distribution')
plt.xlabel('Frequency')
plt.subplot(1, 3, 3)
plt.hist(rfm['Monetary'], bins=30, color='green', alpha=0.7)
plt.title('Monetary Distribution')
plt.xlabel('Monetary Value')
plt.tight_layout()
plt.show()
# 输出RFM数据
print(rfm.describe())
CustomerID Recency Frequency Monetary count 4339.000000 4339.000000 4339.000000 4339.000000 mean 15299.936852 92.518322 4.271952 1194.214335 std 1721.889758 100.009747 7.705493 5055.603049 min 12346.000000 1.000000 1.000000 1.000000 25% 13812.500000 18.000000 1.000000 160.000000 50% 15299.000000 51.000000 2.000000 379.000000 75% 16778.500000 142.000000 5.000000 993.000000 max 18287.000000 374.000000 210.000000 197491.000000
购物篮分析¶
In [13]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0] # 只保留正数量的记录
# 创建购物篮数据框
basket = (data
.groupby(['InvoiceNo', 'Description'])['Quantity']
.sum().unstack().reset_index().fillna(0)
.set_index('InvoiceNo'))
# 将数量转化为布尔值,表示是否购买
basket = basket.applymap(lambda x: 1 if x > 0 else 0)
# 使用Apriori算法找出频繁项集,降低支持度阈值
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)
# 生成关联规则
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# 输出规则
print(rules)
# 可视化前10条规则
plt.figure(figsize=(10, 6))
sns.scatterplot(x='support', y='lift', size='confidence', sizes=(20, 300), data=rules)
plt.title('Association Rules')
plt.xlabel('Support')
plt.ylabel('Lift')
plt.show()
# 出现空数据框的原因可能是支持度(support)阈值设置得太高,导致没有满足条件的频繁项集。我们可以降低支持度阈值,以确保能够找到一些频繁项集。
C:\Users\黄清枫\AppData\Local\Temp\ipykernel_18284\2675874386.py:20: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead. basket = basket.applymap(lambda x: 1 if x > 0 else 0) D:\Python310\lib\site-packages\mlxtend\frequent_patterns\fpcommon.py:109: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type warnings.warn(
Empty DataFrame Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric] Index: []
地区分析¶
In [14]:
import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0] # 只保留正数量的记录
# 计算每个国家的销售额
data['Sales'] = data['Quantity'] * data['UnitPrice']
country_sales = data.groupby('Country').agg({'Sales': 'sum', 'InvoiceNo': 'nunique'}).reset_index()
# 排序
country_sales = country_sales.sort_values(by='Sales', ascending=False)
# 可视化
plt.figure(figsize=(12, 8))
plt.barh(country_sales['Country'].head(10), country_sales['Sales'].head(10), color='skyblue')
plt.title('Top 10 Countries by Sales Amount')
plt.xlabel('Total Sales Amount')
plt.ylabel('Country')
plt.grid(axis='x')
plt.show()
# 输出每个国家的销售额和订单数量
print(country_sales)
Country Sales InvoiceNo 36 United Kingdom 9003097.964 18786 24 Netherlands 285446.340 95 10 EIRE 283453.960 288 14 Germany 228867.140 457 13 France 209715.110 392 0 Australia 138521.310 57 31 Spain 61577.110 90 33 Switzerland 57089.900 54 3 Belgium 41196.340 98 32 Sweden 38378.330 36 20 Japan 37416.370 19 25 Norway 36165.440 36 27 Portugal 33747.100 58 12 Finland 22546.080 41 30 Singapore 21279.290 7 6 Channel Islands 20450.440 26 9 Denmark 18955.340 18 19 Italy 17483.240 38 16 Hong Kong 15691.800 11 7 Cyprus 13590.380 16 1 Austria 10198.680 17 18 Israel 8135.260 8 26 Poland 7334.650 19 15 Greece 4760.520 5 37 Unspecified 4749.790 13 17 Iceland 4310.000 7 5 Canada 3666.380 6 34 USA 3580.390 5 23 Malta 2725.590 5 35 United Arab Emirates 1902.280 3 21 Lebanon 1693.880 1 22 Lithuania 1661.060 4 11 European Community 1300.250 4 4 Brazil 1143.600 1 28 RSA 1002.310 1 8 Czech Republic 826.740 2 2 Bahrain 754.140 3 29 Saudi Arabia 145.920 1
利润分析¶
In [15]:
import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 假设我们有一个产品成本数据集
cost_data = {
'Description': [
'WHITE HANGING HEART T-LIGHT HOLDER',
'WHITE METAL LANTERN',
'CREAM CUPID HEARTS COAT HANGER',
'KNITTED UNION FLAG HOT WATER BOTTLE',
'RED WOOLLY HOTTIE WHITE HEART.',
'SET 7 BABUSHKA NESTING BOXES'
],
'Cost': [1.00, 1.50, 1.20, 2.00, 1.80, 3.50] # 示例成本
}
cost_df = pd.DataFrame(cost_data)
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0] # 只保留正数量的记录
data['Sales'] = data['Quantity'] * data['UnitPrice']
# 合并成本数据
profit_data = pd.merge(data, cost_df, on='Description', how='left')
profit_data['Profit'] = profit_data['Sales'] - (profit_data['Cost'] * profit_data['Quantity'])
# 按产品汇总利润
product_profit = profit_data.groupby('Description').agg({'Profit': 'sum'}).reset_index()
product_profit = product_profit.sort_values(by='Profit', ascending=False)
# 可视化产品利润
plt.figure(figsize=(12, 6))
plt.barh(product_profit['Description'], product_profit['Profit'], color='skyblue')
plt.title('Product Profit Analysis')
plt.xlabel('Total Profit')
plt.ylabel('Product')
plt.grid(axis='x')
plt.show()
# 输出产品利润数据
print(product_profit)
Description Profit 3860 WHITE HANGING HEART T-LIGHT HOLDER 68397.77 2863 RED WOOLLY HOTTIE WHITE HEART. 13567.98 1871 KNITTED UNION FLAG HOT WATER BOTTLE 9518.04 3110 SET 7 BABUSHKA NESTING BOXES 8188.47 893 CREAM CUPID HEARTS COAT HANGER 5238.12 ... ... ... 1364 FRENCH CHATEAU SMALL FRUITBOWL 0.00 1365 FRENCH ENAMEL CANDLEHOLDER 0.00 1366 FRENCH ENAMEL POT W LID 0.00 1367 FRENCH ENAMEL UTENSIL HOLDER 0.00 4076 wrongly sold (22719) barcode 0.00 [4077 rows x 2 columns]
退货分析¶
In [16]:
import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] != 0] # 只保留有效的记录
# 计算退货数量
returns = data[data['Quantity'] < 0]
returns['Quantity'] = returns['Quantity'].abs() # 转换为正数
# 计算每个产品的销售数量和退货数量
sales_data = data[data['Quantity'] > 0]
product_sales_returns = sales_data.groupby('Description')['Quantity'].sum().reset_index()
product_returns = returns.groupby('Description')['Quantity'].sum().reset_index()
# 合并销售和退货数据
returns_analysis = pd.merge(product_sales_returns, product_returns, on='Description', how='outer', suffixes=('_Sales', '_Returns'))
returns_analysis.fillna(0, inplace=True)
# 计算退货率
returns_analysis['Return_Rate'] = returns_analysis['Quantity_Returns'] / returns_analysis['Quantity_Sales'].replace(0, 1)
# 排序
returns_analysis = returns_analysis.sort_values(by='Return_Rate', ascending=False)
# 可视化退货率
plt.figure(figsize=(12, 6))
plt.barh(returns_analysis['Description'], returns_analysis['Return_Rate'], color='salmon')
plt.title('Product Return Rate Analysis')
plt.xlabel('Return Rate')
plt.ylabel('Product')
plt.grid(axis='x')
plt.show()
# 输出退货分析数据
print(returns_analysis)
C:\Users\黄清枫\AppData\Local\Temp\ipykernel_18284\1984400613.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy returns['Quantity'] = returns['Quantity'].abs() # 转换为正数
Description Quantity_Sales Quantity_Returns \
4167 printing smudges/thrown away 0.0 19200.0
3740 Unsaleable, destroyed. 0.0 15644.0
2796 Printing smudges/thrown away 0.0 9058.0
1123 Damaged 0.0 7540.0
4194 throw away 0.0 5368.0
... ... ... ...
1235 ESSENTIAL BALM 3.5g TIN IN ENVELOPE 5884.0 0.0
1236 ETCHED GLASS COASTER 1618.0 0.0
1237 ETCHED GLASS STAR TREE DECORATION 1.0 0.0
2662 PINK POLKADOT WRAP 6350.0 0.0
0 4 PURPLE FLOCK DINNER CANDLES 144.0 0.0
Return_Rate
4167 19200.0
3740 15644.0
2796 9058.0
1123 7540.0
4194 5368.0
... ...
1235 0.0
1236 0.0
1237 0.0
2662 0.0
0 0.0
[4223 rows x 4 columns]
营销活动效果评估¶
In [17]:
import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 假设我们在数据集中添加了促销活动的字段
# 为了示例,我们使用随机数据生成促销活动
import numpy as np
np.random.seed(0)
data['Promo'] = np.random.choice(['No Promo', 'Promo A', 'Promo B'], size=len(data))
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0] # 只保留正数量的记录
data['Sales'] = data['Quantity'] * data['UnitPrice']
# 按促销类型汇总销售数据
promo_analysis = data.groupby('Promo').agg({'Sales': 'sum', 'InvoiceNo': 'nunique'}).reset_index()
# 可视化促销效果
plt.figure(figsize=(10, 6))
plt.bar(promo_analysis['Promo'], promo_analysis['Sales'], color='cornflowerblue')
plt.title('Sales Analysis by Promotion Type')
plt.xlabel('Promotion Type')
plt.ylabel('Total Sales Amount')
plt.grid(axis='y')
plt.show()
# 输出促销分析数据
print(promo_analysis)
Promo Sales InvoiceNo 0 No Promo 3573185.841 18181 1 Promo A 3587804.551 18197 2 Promo B 3483570.032 18148
输出RFM数据到CSV¶
In [18]:
import pandas as pd
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0] # 只保留正数量的记录
# 计算销售额
data['Sales'] = data['Quantity'] * data['UnitPrice']
# 计算RFM指标
snapshot_date = data['InvoiceDate'].max() + pd.DateOffset(days=1)
rfm = data.groupby('CustomerID').agg({
'InvoiceDate': lambda x: (snapshot_date - x.max()).days, # 计算Recency
'InvoiceNo': 'count', # 计算Frequency
'Sales': 'sum' # 计算Monetary
}).reset_index()
# 重命名列
rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']
# 输出RFM数据到CSV
rfm.to_csv('rfm_analysis.csv', index=False)
# 输出结果确认
print(rfm.head())
CustomerID Recency Frequency Monetary 0 12346.0 326 1 77183.60 1 12347.0 2 182 4310.00 2 12348.0 75 31 1797.24 3 12349.0 19 73 1757.55 4 12350.0 310 17 334.40
In [ ]: