import numpy as np

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


# Load the gym-members exercise-tracking dataset into a DataFrame.
DATA_FILE = "gym_members_exercise_tracking_synthetic_data.csv"
df = pd.read_csv(DATA_FILE)

# Notebook-style peeks at the data: first rows, then (rows, columns).
df.head()

df.shape

(1800, 15)

# Print each column's non-null count and dtype to spot missing values and
# columns that were parsed with an unexpected type.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800 entries, 0 to 1799
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Age                            1790 non-null   float64
 1   Gender                         1729 non-null   object 
 2   Weight (kg)                    1778 non-null   float64
 3   Height (m)                     1774 non-null   float64
 4   Max_BPM                        1779 non-null   object 
 5   Avg_BPM                        1770 non-null   float64
 6   Resting_BPM                    1781 non-null   float64
 7   Session_Duration (hours)       1777 non-null   float64
 8   Calories_Burned                1777 non-null   float64
 9   Workout_Type                   1739 non-null   object 
 10  Fat_Percentage                 1784 non-null   float64
 11  Water_Intake (liters)          1776 non-null   float64
 12  Workout_Frequency (days/week)  1742 non-null   float64
 13  Experience_Level               1743 non-null   float64
 14  BMI                            1770 non-null   float64
dtypes: float64(12), object(3)
memory usage: 211.1+ KB

# Descriptive statistics, transposed so that each column becomes one row.
df.describe().transpose()

# Number of missing values in every column.
df.isna().sum()

Age                              10
Gender                           71
Weight (kg)                      22
Height (m)                       26
Max_BPM                          21
Avg_BPM                          30
Resting_BPM                      19
Session_Duration (hours)         23
Calories_Burned                  23
Workout_Type                     61
Fat_Percentage                   16
Water_Intake (liters)            24
Workout_Frequency (days/week)    58
Experience_Level                 57
BMI                              30
dtype: int64

# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

# Pie chart of how many members fall into each Gender value.
gender_counts = df["Gender"].value_counts()

_, ax = plt.subplots()
ax.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90)
ax.set_title("Gender distribution")
plt.show()

# Line plot of Max BPM against Age.
#
# Max_BPM is loaded as an object (string) column, so it has to be converted to
# numbers before it can be used as a quantitative y axis. The conversion is
# done on a local copy here; unparseable entries become NaN and are skipped by
# the plot. The DataFrame itself is left untouched at this point.
max_bpm_numeric = pd.to_numeric(df['Max_BPM'], errors='coerce')

plt.figure(figsize=(10, 6))

sns.lineplot(x=df['Age'], y=max_bpm_numeric, label='Max BPM', color='b')

plt.title('Relationship between Age, Max BPM', fontsize=16)

plt.xlabel('Age', fontsize=12)

plt.ylabel('Values', fontsize=12)

plt.legend()

plt.show()

# Line plot of Calories Burned against Age.
_, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    x='Age',
    y='Calories_Burned',
    data=df,
    color='g',
    label='Calories Burned',
    ax=ax,
)
ax.set_title('Relationship between Age and Calories Burned', fontsize=16)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Values', fontsize=12)
ax.legend()
plt.show()

# Calories burned versus session duration, one line per experience level.
_, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    x='Session_Duration (hours)',
    y='Calories_Burned',
    hue='Experience_Level',
    data=df,
    marker='o',
    ax=ax,
)
ax.set_title('Calories Burned by Experience Level over Time', fontsize=16)
ax.set_xlabel('Session Duration (hours)', fontsize=12)
ax.set_ylabel('Calories Burned', fontsize=12)
plt.show()

# Mean weight for every distinct age, plus the mean of those per-age means
# (drawn as a dashed reference line).
age_weight_mean = df.groupby("Age")["Weight (kg)"].mean().reset_index(name="Average weight")

overall_mean = age_weight_mean["Average weight"].mean()

plt.figure(figsize=(10, 6))

# `markers=True` only takes effect together with a `style` variable, so it drew
# no markers here; `marker='o'` puts a point on every age.
sns.lineplot(data=age_weight_mean, x='Age', y='Average weight', marker='o')

plt.axhline(overall_mean, color='red', linestyle='--', linewidth=2, label=f'Average: {overall_mean:.2f} kg')

# Annotate the reference line just below it. The text is right-aligned to the
# largest age so it stays inside the plotting area instead of running past it.
plt.text(
    age_weight_mean["Age"].max(),
    overall_mean - 1,
    f"Average: {overall_mean:.2f} kg",
    color='red',
    fontsize=12,
    ha='right',
)

plt.title("Average weight by age")

plt.xlabel("Age")

plt.ylabel("Average weight (kg)")

plt.legend()

plt.show()

# Bucket members into age bands and plot how many fall into each band.
#
# The labels describe right-inclusive bands ('18-25' includes 25, '26-35'
# includes 35, ...), so the bins must be closed on the right: (0, 25],
# (25, 35], (35, 45], (45, 60]. With right=False, ages 25, 35 and 45 were
# placed in the next band up, contradicting the labels.
bins = [0, 25, 35, 45, 60]

labels = ['18-25', '26-35', '36-45', '46-60']

df['Age Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True)

plt.figure(figsize=(8, 5))

# sort=False keeps the bars in band order rather than ordering them by count.
df['Age Group'].value_counts(sort=False).plot(kind='bar', color='skyblue', edgecolor='black')

plt.title("Age Group Distribution")

plt.xlabel("Age Group")

plt.ylabel("Count")

plt.xticks(rotation=0)

plt.show()

# Histogram of member weight (15 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    df['Weight (kg)'],
    bins=15,
    kde=True,
    color='skyblue',
    edgecolor='black',
    ax=ax,
)
ax.set_title("Weight Distribution (kg)", fontsize=16, fontweight='bold')
ax.set_xlabel("Weight (kg)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
# Horizontal guide lines only.
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Scatter plot of calories burned against water intake.
_, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Water_Intake (liters)',
    y='Calories_Burned',
    color='purple',
    ax=ax,
)
ax.set_title('Water Intake vs Calories Burned', fontsize=16)
ax.set_xlabel('Water Intake (liters)', fontsize=12)
ax.set_ylabel('Calories Burned', fontsize=12)
plt.show()

# Member counts per experience level, split by gender.
_, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Experience_Level', hue='Gender', data=df, palette='Set1', ax=ax)
ax.set_title('Gender Distribution by Experience Level', fontsize=16)
ax.set_xlabel('Experience Level', fontsize=12)
ax.set_ylabel('Count of Members', fontsize=12)
plt.show()

# Member counts per workout type, split by gender.
_, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Workout_Type', hue='Gender', data=df, palette='Set2', ax=ax)
ax.set_title('Gender Distribution by Workout type', fontsize=16)
ax.set_xlabel('Workout type', fontsize=12)
ax.set_ylabel('Count of Members', fontsize=12)
plt.show()

# Make sure the heart-rate columns and the workout frequency are numeric;
# values that cannot be parsed become NaN.
bpm_columns = ['Max_BPM', 'Avg_BPM', 'Resting_BPM']
frequency_column = 'Workout_Frequency (days/week)'

df[bpm_columns] = df[bpm_columns].apply(pd.to_numeric, errors='coerce')
df[frequency_column] = pd.to_numeric(df[frequency_column], errors='coerce')

# Drop rows missing any of the three BPM readings.
# NOTE: this modifies `df` in place, so every plot after this point works on
# the reduced set of rows.
df.dropna(subset=bpm_columns, inplace=True)

# Mean of each BPM measure per workout-frequency value.
bpm_by_workout = df.groupby(frequency_column)[bpm_columns].mean()

# Grouped bar chart: one cluster per frequency, one bar per BPM measure.
ax = bpm_by_workout.plot.bar(
    figsize=(12, 6),
    color=['lightblue', 'lightgreen', 'lightcoral'],
)
ax.set_title("Average BPM by Workout Frequency", fontsize=16, fontweight='bold')
ax.set_xlabel("Workout Frequency (Days/Week)", fontsize=12)
ax.set_ylabel("BPM", fontsize=12)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Scatter plot of calories burned against session duration.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Session_Duration (hours)',
    y='Calories_Burned',
    color='purple',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Calories Burned vs Session Duration", fontsize=16, fontweight='bold')
ax.set_xlabel("Session Duration (hours)", fontsize=12)
ax.set_ylabel("Calories Burned", fontsize=12)
ax.grid(True)
fig.tight_layout()
plt.show()

# Max BPM against session duration with a fitted regression line in red.
_, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    x='Session_Duration (hours)',
    y='Max_BPM',
    data=df,
    scatter_kws=dict(alpha=0.6),
    line_kws=dict(color='red'),
    ax=ax,
)
ax.set_title('Max BPM vs Session Duration with Trend Line', fontsize=16)
ax.set_xlabel('Session Duration (hours)', fontsize=12)
ax.set_ylabel('Max BPM', fontsize=12)
plt.show()

# Pie chart of how often each workout type occurs.
workout_type_counts = df['Workout_Type'].value_counts()

plt.figure(figsize=(6, 6))

# One 'coolwarm' colour per slice, with outlined wedges and a drop shadow.
slice_colors = sns.color_palette('coolwarm', len(workout_type_counts))
workout_type_counts.plot.pie(
    autopct='%1.1f%%',
    colors=slice_colors,
    wedgeprops=dict(edgecolor='black', linewidth=1.5),
    shadow=True,
    pctdistance=1.5,
)

# Title styling.
plt.title("Distribution of Workout Types", fontsize=18, fontweight='bold', color='darkblue')

# Remove the y-axis label.
plt.ylabel("")

# Tighten the layout to reduce overlapping text.
plt.tight_layout()

# Render the figure.
plt.show()

# Calories burned against session duration, coloured by experience level.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='Session_Duration (hours)',
    y='Calories_Burned',
    hue='Experience_Level',
    palette='coolwarm',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Calories Burned vs Session Duration by Experience Level", fontsize=16, fontweight='bold')
ax.set_xlabel("Session Duration (hours)", fontsize=12)
ax.set_ylabel("Calories Burned", fontsize=12)
# Anchor the legend to the right of the axes.
ax.legend(title="Experience Level", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# BMI against fat percentage; height drives both marker colour and marker size.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='BMI',
    y='Fat_Percentage',
    hue='Height (m)',
    size='Height (m)',
    sizes=(20, 200),
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title("BMI vs Fat Percentage vs Height", fontsize=16, fontweight='bold')
ax.set_xlabel("BMI", fontsize=12)
ax.set_ylabel("Fat Percentage", fontsize=12)
# Anchor the legend to the right of the axes.
ax.legend(title="Height", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# One-hot encode Gender (dropping the first category) so it can take part in
# the correlation matrix.
#
# dtype=float is required: recent pandas versions return boolean dummy columns
# by default, and select_dtypes(include=['number']) below would silently leave
# them out, so the heatmap would not contain the encoded gender at all.
df_encoded = pd.get_dummies(df, columns=['Gender'], drop_first=True, dtype=float)

# get_dummies encodes a missing Gender as all zeros, which is indistinguishable
# from the dropped reference category. Restore NaN for those rows so they are
# excluded from the gender correlations instead of biasing them.
gender_dummy_cols = [col for col in df_encoded.columns if str(col).startswith('Gender_')]
df_encoded.loc[df['Gender'].isna(), gender_dummy_cols] = np.nan

# Correlation matrix over the numeric columns only.
numeric_columns = df_encoded.select_dtypes(include=['number'])
corr_matrix = numeric_columns.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of Numerical Features (With Encoded Gender)', fontsize=16)
plt.show()

	Age	Gender	Weight (kg)	Height (m)	Max_BPM	Avg_BPM	Resting_BPM	Session_Duration (hours)	Calories_Burned	Workout_Type	Fat_Percentage	Water_Intake (liters)	Workout_Frequency (days/week)	Experience_Level	BMI
0	34.0	Female	86.7	1.86	174	152.0	74.0	1.12	712.0	Strength	12.8	2.4	5.0	2.0	14.31
1	26.0	Female	84.7	1.83	166	156.0	73.0	1.00	833.0	Strength	27.9	2.8	5.0	2.0	33.49
2	22.0	Male	64.8	1.85	187	166.0	64.0	1.24	1678.0	Cardio	28.7	1.9	3.0	2.0	12.73
3	54.0	Female	75.3	1.82	187	169.0	58.0	1.45	628.0	Cardio	31.8	2.4	4.0	1.0	20.37
4	34.0	Female	52.8	1.74	177	169.0	66.0	1.60	1286.0	Strength	26.4	3.2	4.0	2.0	20.83

	count	mean	std	min	25%	50%	75%	max
Age	1790.0	34.475419	12.315728	18.00	23.00	33.00	45.0000	59.00
Weight (kg)	1778.0	67.606637	19.861408	40.00	52.30	65.30	80.1000	129.90
Height (m)	1774.0	1.739233	0.123892	1.50	1.65	1.73	1.8200	2.00
Avg_BPM	1770.0	146.259322	15.219516	120.00	133.25	146.00	160.0000	169.00
Resting_BPM	1781.0	63.951151	7.936502	50.00	57.00	65.00	72.0000	74.00
Session_Duration (hours)	1777.0	1.391598	0.372342	0.50	1.13	1.37	1.6400	2.00
Calories_Burned	1777.0	1033.698931	328.049662	303.00	794.00	1030.00	1249.0000	1783.00
Fat_Percentage	1784.0	23.509361	5.865557	10.00	20.50	24.30	27.5000	35.00
Water_Intake (liters)	1776.0	2.706644	0.710224	1.50	2.10	2.80	3.4000	3.70
Workout_Frequency (days/week)	1742.0	3.339265	0.947955	2.00	3.00	3.00	4.0000	5.00
Experience_Level	1743.0	1.823867	0.746568	1.00	1.00	2.00	2.0000	3.00
BMI	1770.0	19.957774	6.573175	12.32	14.78	18.69	23.5175	49.84

📘 健身房会员锻炼数据集可视化分析/gym.ipynb