import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0]  # 只保留正数量的记录
# 计算RFM值
snapshot_date = data['InvoiceDate'].max() + pd.Timedelta(days=1)
rfm = data.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,  # Recency
    'InvoiceNo': 'nunique',  # Frequency
    'Quantity': 'sum'  # Monetary
})
# 重命名列
rfm.rename(columns={'InvoiceDate': 'Recency', 'InvoiceNo': 'Frequency', 'Quantity': 'Monetary'}, inplace=True)
# K-means聚类
kmeans = KMeans(n_clusters=8, random_state=42)  # 假设聚为4类
rfm['Cluster'] = kmeans.fit_predict(rfm[['Recency', 'Frequency', 'Monetary']])
# 可视化聚类结果
plt.figure(figsize=(10, 6))
sns.scatterplot(data=rfm, x='Recency', y='Monetary', hue='Cluster', palette='viridis')
plt.title('RFM Clustering')
plt.xlabel('Recency (days)')
plt.ylabel('Monetary')
plt.legend(title='Cluster')
plt.show()
# 显示聚类中心
centers = pd.DataFrame(kmeans.cluster_centers_, columns=['Recency', 'Frequency', 'Monetary'])
print("Cluster Centers:\n", centers)
# 将结果保存为CSV文件
# rfm.to_csv('rfm_kmeans_clustered.csv')

D:\Python310\lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)

Cluster Centers:
       Recency  Frequency       Monetary
0   26.352941  15.052941    4990.829412
1    5.000000  44.666667   59744.666667
2  109.514447   2.402741     346.787906
3    8.533333  61.400000   29782.733333
4    2.000000  74.000000  197491.000000
5   72.000000  54.200000   76678.000000
6   22.843750  23.625000   12524.562500
7   37.272244   7.464807    1835.606906

# 定义客户类型
customer_types = {
    0: "中频高值客户",
    1: "高频高值客户",
    2: "低频低值客户",
    3: "中频高值客户",
    4: "极高频高值客户",
    5: "高频中值客户",
    6: "中频低值客户",
    7: "低频低值客户"
}
# 统计每个聚类的客户数量
cluster_counts = rfm['Cluster'].value_counts()
# 输出客户数量和类型
for cluster, count in cluster_counts.items():
    print(f"Cluster {cluster} ({customer_types[cluster]}) 客户数量: {count}")
# 获取每个聚类的客户ID
# for i in range(8):
#     customer_ids = rfm[rfm['Cluster'] == i].index.tolist()
#     print(f"Cluster {i} ({customer_types[i]}) 客户ID：{customer_ids}")

Cluster 2 (低频低值客户) 客户数量: 3351
Cluster 7 (低频低值客户) 客户数量: 757
Cluster 0 (中频高值客户) 客户数量: 172
Cluster 6 (中频低值客户) 客户数量: 32
Cluster 3 (中频高值客户) 客户数量: 15
Cluster 1 (高频高值客户) 客户数量: 6
Cluster 5 (高频中值客户) 客户数量: 5
Cluster 4 (极高频高值客户) 客户数量: 1

import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0]  # 只保留正数量的记录
# 按月汇总销售额
data['Sales'] = data['Quantity'] * data['UnitPrice']
monthly_sales = data.resample('M', on='InvoiceDate').sum()['Sales']
# 可视化销售趋势
plt.figure(figsize=(12, 6))
monthly_sales.plot()
plt.title('Monthly Sales Trend')
plt.xlabel('Month')
plt.ylabel('Sales Amount')
plt.xticks(rotation=45)
plt.grid()
plt.show()

C:\Users\黄清枫\AppData\Local\Temp\ipykernel_18284\1519722528.py:13: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
  monthly_sales = data.resample('M', on='InvoiceDate').sum()['Sales']

import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0]  # 只保留正数量的记录
# 计算每个产品的销售额
data['Sales'] = data['Quantity'] * data['UnitPrice']
# 计算每个产品的总销售额和销售数量
product_sales = data.groupby('Description').agg({'Quantity': 'sum', 'Sales': 'sum'}).reset_index()
# 按销售数量排序
top_products = product_sales.sort_values(by='Quantity', ascending=False).head(10)
bottom_products = product_sales.sort_values(by='Quantity').head(10)
# 可视化热销产品
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(top_products['Description'], top_products['Quantity'], color='green')
plt.title('Top 10 Best-Selling Products')
plt.xlabel('Sales Quantity')
# 可视化滞销产品
plt.subplot(1, 2, 2)
plt.barh(bottom_products['Description'], bottom_products['Quantity'], color='red')
plt.title('Top 10 Worst-Selling Products')
plt.xlabel('Sales Quantity')
plt.tight_layout()
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0]  # 只保留正数量的记录
# 计算RFM值
snapshot_date = data['InvoiceDate'].max() + pd.Timedelta(days=1)
rfm = data.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,  # Recency
    'InvoiceNo': 'nunique',  # Frequency
    'Quantity': 'sum'  # Monetary
}).reset_index()
rfm.rename(columns={'InvoiceDate': 'Recency', 'InvoiceNo': 'Frequency', 'Quantity': 'Monetary'}, inplace=True)
# 可视化RFM值分布
plt.figure(figsize=(12, 6))
plt.subplot(1, 3, 1)
plt.hist(rfm['Recency'], bins=30, color='blue', alpha=0.7)
plt.title('Recency Distribution')
plt.xlabel('Recency (days)')
plt.ylabel('Frequency')
plt.subplot(1, 3, 2)
plt.hist(rfm['Frequency'], bins=30, color='orange', alpha=0.7)
plt.title('Frequency Distribution')
plt.xlabel('Frequency')
plt.subplot(1, 3, 3)
plt.hist(rfm['Monetary'], bins=30, color='green', alpha=0.7)
plt.title('Monetary Distribution')
plt.xlabel('Monetary Value')
plt.tight_layout()
plt.show()
# 输出RFM数据
print(rfm.describe())

         CustomerID      Recency    Frequency       Monetary
count   4339.000000  4339.000000  4339.000000    4339.000000
mean   15299.936852    92.518322     4.271952    1194.214335
std     1721.889758   100.009747     7.705493    5055.603049
min    12346.000000     1.000000     1.000000       1.000000
25%    13812.500000    18.000000     1.000000     160.000000
50%    15299.000000    51.000000     2.000000     379.000000
75%    16778.500000   142.000000     5.000000     993.000000
max    18287.000000   374.000000   210.000000  197491.000000

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.pyplot as plt
import seaborn as sns
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0]  # 只保留正数量的记录
# 创建购物篮数据框
basket = (data
          .groupby(['InvoiceNo', 'Description'])['Quantity']
          .sum().unstack().reset_index().fillna(0)
          .set_index('InvoiceNo'))
# 将数量转化为布尔值，表示是否购买
basket = basket.applymap(lambda x: 1 if x > 0 else 0)
# 使用Apriori算法找出频繁项集，降低支持度阈值
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)
# 生成关联规则
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
# 输出规则
print(rules)
# 可视化前10条规则
plt.figure(figsize=(10, 6))
sns.scatterplot(x='support', y='lift', size='confidence', sizes=(20, 300), data=rules)
plt.title('Association Rules')
plt.xlabel('Support')
plt.ylabel('Lift')
plt.show()
# 出现空数据框的原因可能是支持度（support）阈值设置得太高，导致没有满足条件的频繁项集。我们可以降低支持度阈值，以确保能够找到一些频繁项集。

C:\Users\黄清枫\AppData\Local\Temp\ipykernel_18284\2675874386.py:20: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
  basket = basket.applymap(lambda x: 1 if x > 0 else 0)
D:\Python310\lib\site-packages\mlxtend\frequent_patterns\fpcommon.py:109: DeprecationWarning: DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type
  warnings.warn(

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, leverage, conviction, zhangs_metric]
Index: []

import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0]  # 只保留正数量的记录
# 计算每个国家的销售额
data['Sales'] = data['Quantity'] * data['UnitPrice']
country_sales = data.groupby('Country').agg({'Sales': 'sum', 'InvoiceNo': 'nunique'}).reset_index()
# 排序
country_sales = country_sales.sort_values(by='Sales', ascending=False)
# 可视化
plt.figure(figsize=(12, 8))
plt.barh(country_sales['Country'].head(10), country_sales['Sales'].head(10), color='skyblue')
plt.title('Top 10 Countries by Sales Amount')
plt.xlabel('Total Sales Amount')
plt.ylabel('Country')
plt.grid(axis='x')
plt.show()
# 输出每个国家的销售额和订单数量
print(country_sales)

                 Country        Sales  InvoiceNo
36        United Kingdom  9003097.964      18786
24           Netherlands   285446.340         95
10                  EIRE   283453.960        288
14               Germany   228867.140        457
13                France   209715.110        392
0              Australia   138521.310         57
31                 Spain    61577.110         90
33           Switzerland    57089.900         54
3                Belgium    41196.340         98
32                Sweden    38378.330         36
20                 Japan    37416.370         19
25                Norway    36165.440         36
27              Portugal    33747.100         58
12               Finland    22546.080         41
30             Singapore    21279.290          7
6        Channel Islands    20450.440         26
9                Denmark    18955.340         18
19                 Italy    17483.240         38
16             Hong Kong    15691.800         11
7                 Cyprus    13590.380         16
1                Austria    10198.680         17
18                Israel     8135.260          8
26                Poland     7334.650         19
15                Greece     4760.520          5
37           Unspecified     4749.790         13
17               Iceland     4310.000          7
5                 Canada     3666.380          6
34                   USA     3580.390          5
23                 Malta     2725.590          5
35  United Arab Emirates     1902.280          3
21               Lebanon     1693.880          1
22             Lithuania     1661.060          4
11    European Community     1300.250          4
4                 Brazil     1143.600          1
28                   RSA     1002.310          1
8         Czech Republic      826.740          2
2                Bahrain      754.140          3
29          Saudi Arabia      145.920          1

import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 假设我们有一个产品成本数据集
cost_data = {
    'Description': [
        'WHITE HANGING HEART T-LIGHT HOLDER', 
        'WHITE METAL LANTERN', 
        'CREAM CUPID HEARTS COAT HANGER',
        'KNITTED UNION FLAG HOT WATER BOTTLE',
        'RED WOOLLY HOTTIE WHITE HEART.',
        'SET 7 BABUSHKA NESTING BOXES'
    ],
    'Cost': [1.00, 1.50, 1.20, 2.00, 1.80, 3.50]  # 示例成本
}
cost_df = pd.DataFrame(cost_data)
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0]  # 只保留正数量的记录
data['Sales'] = data['Quantity'] * data['UnitPrice']
# 合并成本数据
profit_data = pd.merge(data, cost_df, on='Description', how='left')
profit_data['Profit'] = profit_data['Sales'] - (profit_data['Cost'] * profit_data['Quantity'])
# 按产品汇总利润
product_profit = profit_data.groupby('Description').agg({'Profit': 'sum'}).reset_index()
product_profit = product_profit.sort_values(by='Profit', ascending=False)
# 可视化产品利润
plt.figure(figsize=(12, 6))
plt.barh(product_profit['Description'], product_profit['Profit'], color='skyblue')
plt.title('Product Profit Analysis')
plt.xlabel('Total Profit')
plt.ylabel('Product')
plt.grid(axis='x')
plt.show()
# 输出产品利润数据
print(product_profit)

                              Description    Profit
3860   WHITE HANGING HEART T-LIGHT HOLDER  68397.77
2863       RED WOOLLY HOTTIE WHITE HEART.  13567.98
1871  KNITTED UNION FLAG HOT WATER BOTTLE   9518.04
3110         SET 7 BABUSHKA NESTING BOXES   8188.47
893        CREAM CUPID HEARTS COAT HANGER   5238.12
...                                   ...       ...
1364       FRENCH CHATEAU SMALL FRUITBOWL      0.00
1365           FRENCH ENAMEL CANDLEHOLDER      0.00
1366              FRENCH ENAMEL POT W LID      0.00
1367         FRENCH ENAMEL UTENSIL HOLDER      0.00
4076         wrongly sold (22719) barcode      0.00

[4077 rows x 2 columns]

import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] != 0]  # 只保留有效的记录
# 计算退货数量
returns = data[data['Quantity'] < 0]
returns['Quantity'] = returns['Quantity'].abs()  # 转换为正数
# 计算每个产品的销售数量和退货数量
sales_data = data[data['Quantity'] > 0]
product_sales_returns = sales_data.groupby('Description')['Quantity'].sum().reset_index()
product_returns = returns.groupby('Description')['Quantity'].sum().reset_index()
# 合并销售和退货数据
returns_analysis = pd.merge(product_sales_returns, product_returns, on='Description', how='outer', suffixes=('_Sales', '_Returns'))
returns_analysis.fillna(0, inplace=True)
# 计算退货率
returns_analysis['Return_Rate'] = returns_analysis['Quantity_Returns'] / returns_analysis['Quantity_Sales'].replace(0, 1)
# 排序
returns_analysis = returns_analysis.sort_values(by='Return_Rate', ascending=False)
# 可视化退货率
plt.figure(figsize=(12, 6))
plt.barh(returns_analysis['Description'], returns_analysis['Return_Rate'], color='salmon')
plt.title('Product Return Rate Analysis')
plt.xlabel('Return Rate')
plt.ylabel('Product')
plt.grid(axis='x')
plt.show()
# 输出退货分析数据
print(returns_analysis)

C:\Users\黄清枫\AppData\Local\Temp\ipykernel_18284\1984400613.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  returns['Quantity'] = returns['Quantity'].abs()  # 转换为正数

                              Description  Quantity_Sales  Quantity_Returns  \
4167         printing smudges/thrown away             0.0           19200.0   
3740               Unsaleable, destroyed.             0.0           15644.0   
2796         Printing smudges/thrown away             0.0            9058.0   
1123                              Damaged             0.0            7540.0   
4194                           throw away             0.0            5368.0   
...                                   ...             ...               ...   
1235  ESSENTIAL BALM 3.5g TIN IN ENVELOPE          5884.0               0.0   
1236                 ETCHED GLASS COASTER          1618.0               0.0   
1237    ETCHED GLASS STAR TREE DECORATION             1.0               0.0   
2662                  PINK POLKADOT WRAP           6350.0               0.0   
0           4 PURPLE FLOCK DINNER CANDLES           144.0               0.0   

      Return_Rate  
4167      19200.0  
3740      15644.0  
2796       9058.0  
1123       7540.0  
4194       5368.0  
...           ...  
1235          0.0  
1236          0.0  
1237          0.0  
2662          0.0  
0             0.0  

[4223 rows x 4 columns]

import pandas as pd
import matplotlib.pyplot as plt
# 读取数据
data = pd.read_csv('online_retail.csv')
# 假设我们在数据集中添加了促销活动的字段
# 为了示例，我们使用随机数据生成促销活动
import numpy as np
np.random.seed(0)
data['Promo'] = np.random.choice(['No Promo', 'Promo A', 'Promo B'], size=len(data))
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0]  # 只保留正数量的记录
data['Sales'] = data['Quantity'] * data['UnitPrice']
# 按促销类型汇总销售数据
promo_analysis = data.groupby('Promo').agg({'Sales': 'sum', 'InvoiceNo': 'nunique'}).reset_index()
# 可视化促销效果
plt.figure(figsize=(10, 6))
plt.bar(promo_analysis['Promo'], promo_analysis['Sales'], color='cornflowerblue')
plt.title('Sales Analysis by Promotion Type')
plt.xlabel('Promotion Type')
plt.ylabel('Total Sales Amount')
plt.grid(axis='y')
plt.show()
# 输出促销分析数据
print(promo_analysis)

      Promo        Sales  InvoiceNo
0  No Promo  3573185.841      18181
1   Promo A  3587804.551      18197
2   Promo B  3483570.032      18148

import pandas as pd
# 读取数据
data = pd.read_csv('online_retail.csv')
# 数据清洗
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
data = data[data['Quantity'] > 0]  # 只保留正数量的记录
# 计算销售额
data['Sales'] = data['Quantity'] * data['UnitPrice']
# 计算RFM指标
snapshot_date = data['InvoiceDate'].max() + pd.DateOffset(days=1)
rfm = data.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,  # 计算Recency
    'InvoiceNo': 'count',  # 计算Frequency
    'Sales': 'sum'  # 计算Monetary
}).reset_index()
# 重命名列
rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']
# 输出RFM数据到CSV
rfm.to_csv('rfm_analysis.csv', index=False)
# 输出结果确认
print(rfm.head())

   CustomerID  Recency  Frequency  Monetary
0     12346.0      326          1  77183.60
1     12347.0        2        182   4310.00
2     12348.0       75         31   1797.24
3     12349.0       19         73   1757.55
4     12350.0      310         17    334.40

📘 基于RFM+Kmeans的电商客户价值聚类分析/RFM.ipynb

销售趋势分析¶

产品分析¶

客户分析¶

购物篮分析¶

地区分析¶

利润分析¶

退货分析¶

营销活动效果评估¶

输出RFM数据到CSV¶