In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence pandas/seaborn FutureWarning chatter for the rest of the notebook.
warnings.simplefilter("ignore", FutureWarning)
# Load the dataset and preview its first rows.
DATA_PATH = "gym_members_exercise_tracking_synthetic_data.csv"
df = pd.read_csv(DATA_PATH)
df.head()
Out[1]:
| Age | Gender | Weight (kg) | Height (m) | Max_BPM | Avg_BPM | Resting_BPM | Session_Duration (hours) | Calories_Burned | Workout_Type | Fat_Percentage | Water_Intake (liters) | Workout_Frequency (days/week) | Experience_Level | BMI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 34.0 | Female | 86.7 | 1.86 | 174 | 152.0 | 74.0 | 1.12 | 712.0 | Strength | 12.8 | 2.4 | 5.0 | 2.0 | 14.31 |
| 1 | 26.0 | Female | 84.7 | 1.83 | 166 | 156.0 | 73.0 | 1.00 | 833.0 | Strength | 27.9 | 2.8 | 5.0 | 2.0 | 33.49 |
| 2 | 22.0 | Male | 64.8 | 1.85 | 187 | 166.0 | 64.0 | 1.24 | 1678.0 | Cardio | 28.7 | 1.9 | 3.0 | 2.0 | 12.73 |
| 3 | 54.0 | Female | 75.3 | 1.82 | 187 | 169.0 | 58.0 | 1.45 | 628.0 | Cardio | 31.8 | 2.4 | 4.0 | 1.0 | 20.37 |
| 4 | 34.0 | Female | 52.8 | 1.74 | 177 | 169.0 | 66.0 | 1.60 | 1286.0 | Strength | 26.4 | 3.2 | 4.0 | 2.0 | 20.83 |
In [2]:
# Dataset dimensions as (rows, columns).
df.shape
Out[2]:
(1800, 15)
In [3]:
# Per-column non-null counts and dtypes; a quick way to spot missing values
# and columns that were not parsed as numbers.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1800 entries, 0 to 1799 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 1790 non-null float64 1 Gender 1729 non-null object 2 Weight (kg) 1778 non-null float64 3 Height (m) 1774 non-null float64 4 Max_BPM 1779 non-null object 5 Avg_BPM 1770 non-null float64 6 Resting_BPM 1781 non-null float64 7 Session_Duration (hours) 1777 non-null float64 8 Calories_Burned 1777 non-null float64 9 Workout_Type 1739 non-null object 10 Fat_Percentage 1784 non-null float64 11 Water_Intake (liters) 1776 non-null float64 12 Workout_Frequency (days/week) 1742 non-null float64 13 Experience_Level 1743 non-null float64 14 BMI 1770 non-null float64 dtypes: float64(12), object(3) memory usage: 211.1+ KB
In [4]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()
Out[4]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Age | 1790.0 | 34.475419 | 12.315728 | 18.00 | 23.00 | 33.00 | 45.0000 | 59.00 |
| Weight (kg) | 1778.0 | 67.606637 | 19.861408 | 40.00 | 52.30 | 65.30 | 80.1000 | 129.90 |
| Height (m) | 1774.0 | 1.739233 | 0.123892 | 1.50 | 1.65 | 1.73 | 1.8200 | 2.00 |
| Avg_BPM | 1770.0 | 146.259322 | 15.219516 | 120.00 | 133.25 | 146.00 | 160.0000 | 169.00 |
| Resting_BPM | 1781.0 | 63.951151 | 7.936502 | 50.00 | 57.00 | 65.00 | 72.0000 | 74.00 |
| Session_Duration (hours) | 1777.0 | 1.391598 | 0.372342 | 0.50 | 1.13 | 1.37 | 1.6400 | 2.00 |
| Calories_Burned | 1777.0 | 1033.698931 | 328.049662 | 303.00 | 794.00 | 1030.00 | 1249.0000 | 1783.00 |
| Fat_Percentage | 1784.0 | 23.509361 | 5.865557 | 10.00 | 20.50 | 24.30 | 27.5000 | 35.00 |
| Water_Intake (liters) | 1776.0 | 2.706644 | 0.710224 | 1.50 | 2.10 | 2.80 | 3.4000 | 3.70 |
| Workout_Frequency (days/week) | 1742.0 | 3.339265 | 0.947955 | 2.00 | 3.00 | 3.00 | 4.0000 | 5.00 |
| Experience_Level | 1743.0 | 1.823867 | 0.746568 | 1.00 | 1.00 | 2.00 | 2.0000 | 3.00 |
| BMI | 1770.0 | 19.957774 | 6.573175 | 12.32 | 14.78 | 18.69 | 23.5175 | 49.84 |
In [5]:
# Number of missing values in each column.
df.isna().sum()
Out[5]:
Age 10 Gender 71 Weight (kg) 22 Height (m) 26 Max_BPM 21 Avg_BPM 30 Resting_BPM 19 Session_Duration (hours) 23 Calories_Burned 23 Workout_Type 61 Fat_Percentage 16 Water_Intake (liters) 24 Workout_Frequency (days/week) 58 Experience_Level 57 BMI 30 dtype: int64
In [6]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()
Out[6]:
0
In [7]:
# Pie chart of how many members fall under each recorded gender.
gender_counts = df["Gender"].value_counts()
fig, ax = plt.subplots()
ax.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90)
ax.set_title("Gender distribution")
plt.show()
In [8]:
# Max_BPM loads from the CSV with object dtype, so it must be coerced to
# numbers before it can be averaged and drawn on a continuous axis. The
# coercion is done on a small local copy so `df` itself is left untouched;
# entries that cannot be parsed become NaN.
age_bpm = df[['Age', 'Max_BPM']].assign(
    Max_BPM=lambda frame: pd.to_numeric(frame['Max_BPM'], errors='coerce')
)
plt.figure(figsize=(10, 6))
sns.lineplot(data=age_bpm, x='Age', y='Max_BPM', label='Max BPM', color='b')
plt.title('Relationship between Age, Max BPM', fontsize=16)
plt.xlabel('Age', fontsize=12)
plt.ylabel('Values', fontsize=12)
plt.legend()
plt.show()
In [9]:
# Line plot of calories burned against age.
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(x='Age', y='Calories_Burned', data=df, color='g', label='Calories Burned', ax=ax)
ax.set_title('Relationship between Age and Calories Burned', fontsize=16)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Values', fontsize=12)
ax.legend()
plt.show()
In [10]:
# Calories burned against session duration, one line per experience level.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    x='Session_Duration (hours)',
    y='Calories_Burned',
    hue='Experience_Level',
    marker='o',
    data=df,
    ax=ax,
)
ax.set_title('Calories Burned by Experience Level over Time', fontsize=16)
ax.set_xlabel('Session Duration (hours)', fontsize=12)
ax.set_ylabel('Calories Burned', fontsize=12)
plt.show()
In [11]:
# Mean weight for every distinct age, and the mean of those per-age means
# (drawn as the dashed reference line).
age_weight_mean = df.groupby("Age")["Weight (kg)"].mean().reset_index(name="Average weight")
overall_mean = age_weight_mean["Average weight"].mean()
plt.figure(figsize=(10, 6))
# `markers=True` only has an effect together with a `style` variable, so it
# drew no markers here; `marker='o'` puts a point marker on every age.
sns.lineplot(data=age_weight_mean, x='Age', y='Average weight', marker='o')
plt.axhline(overall_mean, color='red', linestyle='--', linewidth=2, label=f'Average: {overall_mean:.2f} kg')
# Anchor the annotation by its right edge so it ends at the last age instead
# of starting there and running past the plot area.
plt.text(
    age_weight_mean["Age"].max(),
    overall_mean - 1,
    f"Average: {overall_mean:.2f} kg",
    color='red',
    fontsize=12,
    ha='right'
)
plt.title("Average weight by age")
plt.xlabel("Age")
plt.ylabel("Average weight (kg)")
plt.legend()
plt.show()
In [12]:
# Bucket members into age brackets and plot how many fall into each one.
# The intervals must be closed on the right -- (0, 25], (25, 35], (35, 45],
# (45, 60] -- to match the labels: with left-closed bins an age of exactly
# 25/35/45 was pushed into the next bracket and 60 fell outside every bin.
bins = [0, 25, 35, 45, 60]
labels = ['18-25', '26-35', '36-45', '46-60']
df['Age Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True)
plt.figure(figsize=(8, 5))
# sort=False keeps the bars in bracket order rather than by frequency.
df['Age Group'].value_counts(sort=False).plot(kind='bar', color='skyblue', edgecolor='black')
plt.title("Age Group Distribution")
plt.xlabel("Age Group")
plt.ylabel("Count")
plt.xticks(rotation=0)
plt.show()
In [13]:
# Histogram of member weights with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Weight (kg)', bins=15, kde=True, color='skyblue', edgecolor='black', ax=ax)
ax.set_title("Weight Distribution (kg)", fontsize=16, fontweight='bold')
ax.set_xlabel("Weight (kg)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()
In [14]:
# Scatter of daily water intake against calories burned.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Water_Intake (liters)', y='Calories_Burned', color='purple', ax=ax)
ax.set_title('Water Intake vs Calories Burned', fontsize=16)
ax.set_xlabel('Water Intake (liters)', fontsize=12)
ax.set_ylabel('Calories Burned', fontsize=12)
plt.show()
In [15]:
# Member counts per experience level, split by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Experience_Level', hue='Gender', data=df, palette='Set1', ax=ax)
ax.set_title('Gender Distribution by Experience Level', fontsize=16)
ax.set_xlabel('Experience Level', fontsize=12)
ax.set_ylabel('Count of Members', fontsize=12)
plt.show()
In [16]:
# Member counts per workout type, split by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Workout_Type', hue='Gender', data=df, palette='Set2', ax=ax)
ax.set_title('Gender Distribution by Workout type', fontsize=16)
ax.set_xlabel('Workout type', fontsize=12)
ax.set_ylabel('Count of Members', fontsize=12)
plt.show()
In [17]:
# Make sure the heart-rate and workout-frequency columns are numeric; entries
# that cannot be parsed become NaN. Re-running this conversion is harmless.
bpm_cols = ['Max_BPM', 'Avg_BPM', 'Resting_BPM']
df[bpm_cols] = df[bpm_cols].apply(pd.to_numeric, errors='coerce')
df['Workout_Frequency (days/week)'] = pd.to_numeric(df['Workout_Frequency (days/week)'], errors='coerce')
# Use only rows that have all three BPM readings, but filter into a local
# frame: dropping rows from `df` in place would silently shrink the dataset
# for every later cell and make this cell's effect depend on run order.
bpm_complete = df.dropna(subset=bpm_cols)
# Mean of each BPM measure per workout frequency.
bpm_by_workout = bpm_complete.groupby('Workout_Frequency (days/week)')[bpm_cols].mean()
# Grouped bar chart, one bar per BPM measure.
bpm_by_workout.plot(kind='bar', figsize=(12, 6), color=['lightblue', 'lightgreen', 'lightcoral'])
plt.title("Average BPM by Workout Frequency", fontsize=16, fontweight='bold')
plt.xlabel("Workout Frequency (Days/Week)", fontsize=12)
plt.ylabel("BPM", fontsize=12)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
In [18]:
# Scatter of calories burned against session length.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Session_Duration (hours)', y='Calories_Burned', alpha=0.7, color='purple', ax=ax)
ax.set_title("Calories Burned vs Session Duration", fontsize=16, fontweight='bold')
ax.set_xlabel("Session Duration (hours)", fontsize=12)
ax.set_ylabel("Calories Burned", fontsize=12)
ax.grid(True)
fig.tight_layout()
plt.show()
In [19]:
# Max BPM against session length with a fitted regression line in red.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    x='Session_Duration (hours)',
    y='Max_BPM',
    data=df,
    scatter_kws=dict(alpha=0.6),
    line_kws=dict(color='red'),
    ax=ax,
)
ax.set_title('Max BPM vs Session Duration with Trend Line', fontsize=16)
ax.set_xlabel('Session Duration (hours)', fontsize=12)
ax.set_ylabel('Max BPM', fontsize=12)
plt.show()
In [25]:
# Pie chart of how often each workout type occurs.
workout_type_counts = df['Workout_Type'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
# One 'coolwarm' colour per workout type, black wedge outlines and a shadow.
wedge_colors = sns.color_palette('coolwarm', len(workout_type_counts))
workout_type_counts.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    colors=wedge_colors,
    wedgeprops={'edgecolor': 'black', 'linewidth': 1.5},
    shadow=True,
    pctdistance=1.5,
)
# Title styling.
ax.set_title("Distribution of Workout Types", fontsize=18, fontweight='bold', color='darkblue')
# Remove the y label.
ax.set_ylabel("")
# Tighten the layout to reduce overlapping text.
fig.tight_layout()
# Render the figure.
plt.show()
In [21]:
# Calories burned against session length, coloured by experience level.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='Session_Duration (hours)',
    y='Calories_Burned',
    hue='Experience_Level',
    palette='coolwarm',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Calories Burned vs Session Duration by Experience Level", fontsize=16, fontweight='bold')
ax.set_xlabel("Session Duration (hours)", fontsize=12)
ax.set_ylabel("Calories Burned", fontsize=12)
# Park the legend outside the axes, to the right of the plot.
ax.legend(title="Experience Level", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()
In [22]:
# BMI against fat percentage; height drives both point colour and point size.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=df,
    x='BMI',
    y='Fat_Percentage',
    hue='Height (m)',
    size='Height (m)',
    sizes=(20, 200),
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title("BMI vs Fat Percentage vs Height", fontsize=16, fontweight='bold')
ax.set_xlabel("BMI", fontsize=12)
ax.set_ylabel("Fat Percentage", fontsize=12)
# Park the legend outside the axes, to the right of the plot.
ax.legend(title="Height", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()
In [26]:
# One-hot encode Gender (dropping the first category as the baseline).
# dtype=float is required for two reasons:
#   * recent pandas returns boolean dummy columns by default, and
#     select_dtypes(include=['number']) skips booleans, so the encoded gender
#     would never reach the heatmap;
#   * a float column can hold NaN for members whose gender is missing.
df_encoded = pd.get_dummies(df, columns=['Gender'], drop_first=True, dtype=float)
# get_dummies encodes a missing Gender as all zeros, which is indistinguishable
# from the dropped baseline category; restore NaN so those rows are left out
# of the gender correlations instead of being counted as the baseline.
gender_dummy_cols = [col for col in df_encoded.columns if col not in df.columns]
df_encoded.loc[df['Gender'].isna(), gender_dummy_cols] = np.nan
# Correlation matrix over every numeric column, encoded gender included.
numeric_columns = df_encoded.select_dtypes(include=['number'])
corr_matrix = numeric_columns.corr()
# Annotated heatmap of the correlations.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of Numerical Features (With Encoded Gender)', fontsize=16)
plt.show()
In [ ]: