In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')
# Load the CO2 emissions CSV into a DataFrame and preview the first rows
DATA_PATH = 'CO2 Emissions_Canada.csv'
df = pd.read_csv(DATA_PATH)
df.head()
Out[3]:
| Make | Model | Vehicle Class | Engine Size(L) | Cylinders | Transmission | Fuel Type | Fuel Consumption City (L/100 km) | Fuel Consumption Hwy (L/100 km) | Fuel Consumption Comb (L/100 km) | Fuel Consumption Comb (mpg) | CO2 Emissions(g/km) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ACURA | ILX | COMPACT | 2.0 | 4 | AS5 | Z | 9.9 | 6.7 | 8.5 | 33 | 196 |
| 1 | ACURA | ILX | COMPACT | 2.4 | 4 | M6 | Z | 11.2 | 7.7 | 9.6 | 29 | 221 |
| 2 | ACURA | ILX HYBRID | COMPACT | 1.5 | 4 | AV7 | Z | 6.0 | 5.8 | 5.9 | 48 | 136 |
| 3 | ACURA | MDX 4WD | SUV - SMALL | 3.5 | 6 | AS6 | Z | 12.7 | 9.1 | 11.1 | 25 | 255 |
| 4 | ACURA | RDX AWD | SUV - SMALL | 3.5 | 6 | AS6 | Z | 12.1 | 8.7 | 10.6 | 27 | 244 |
In [4]:
# Check the dataset size as (rows, columns)
df.shape
Out[4]:
(7385, 12)
In [8]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()
Out[8]:
| Engine Size(L) | Cylinders | Fuel Consumption City (L/100 km) | Fuel Consumption Hwy (L/100 km) | Fuel Consumption Comb (L/100 km) | Fuel Consumption Comb (mpg) | CO2 Emissions(g/km) | |
|---|---|---|---|---|---|---|---|
| count | 7385.000000 | 7385.000000 | 7385.000000 | 7385.000000 | 7385.000000 | 7385.000000 | 7385.000000 |
| mean | 3.160068 | 5.615030 | 12.556534 | 9.041706 | 10.975071 | 27.481652 | 250.584699 |
| std | 1.354170 | 1.828307 | 3.500274 | 2.224456 | 2.892506 | 7.231879 | 58.512679 |
| min | 0.900000 | 3.000000 | 4.200000 | 4.000000 | 4.100000 | 11.000000 | 96.000000 |
| 25% | 2.000000 | 4.000000 | 10.100000 | 7.500000 | 8.900000 | 22.000000 | 208.000000 |
| 50% | 3.000000 | 6.000000 | 12.100000 | 8.700000 | 10.600000 | 27.000000 | 246.000000 |
| 75% | 3.700000 | 6.000000 | 14.600000 | 10.200000 | 12.600000 | 32.000000 | 288.000000 |
| max | 8.400000 | 16.000000 | 30.600000 | 20.600000 | 26.100000 | 69.000000 | 522.000000 |
In [10]:
# Count missing values in each column
df.isnull().sum()
Out[10]:
Make 0 Model 0 Vehicle Class 0 Engine Size(L) 0 Cylinders 0 Transmission 0 Fuel Type 0 Fuel Consumption City (L/100 km) 0 Fuel Consumption Hwy (L/100 km) 0 Fuel Consumption Comb (L/100 km) 0 Fuel Consumption Comb (mpg) 0 CO2 Emissions(g/km) 0 dtype: int64
In [11]:
# Count rows flagged as duplicates
df.duplicated().sum()
Out[11]:
1103
In [12]:
# Drop the duplicated rows; inplace=True modifies df itself, so every later cell sees the de-duplicated data
df.drop_duplicates(inplace=True)
In [13]:
# Re-count duplicated rows to verify that the drop removed them
df.duplicated().sum()
Out[13]:
0
In [14]:
# Histograms of engine size, combined fuel consumption, and CO2 emissions.
# Each entry: (column, bar colour, panel title, x-axis label)
hist_specs = [
    ('Engine Size(L)', 'skyblue', 'Engine Size Distribution', 'Engine Size (L)'),
    ('Fuel Consumption Comb (L/100 km)', 'salmon', 'Fuel Consumption Distribution', 'Fuel Consumption Comb (L/100 km)'),
    ('CO2 Emissions(g/km)', 'lightgreen', 'CO2 Emissions Distribution', 'CO2 Emissions (g/km)'),
]

fig, axs = plt.subplots(1, 3, figsize=(15, 5))
for hist_ax, (hist_col, hist_color, hist_title, hist_xlabel) in zip(axs, hist_specs):
    hist_ax.hist(df[hist_col], bins=20, color=hist_color, edgecolor='black')
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel(hist_xlabel)
    hist_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()
In [15]:
# Scatter plots of engine size against combined fuel consumption and against
# CO2 emissions, one figure each.
# Each entry: (y column, marker colour, title, y-axis label)
scatter_specs = [
    ('Fuel Consumption Comb (L/100 km)', 'blue', 'Engine Size vs. Fuel Consumption', 'Fuel Consumption Comb (L/100 km)'),
    ('CO2 Emissions(g/km)', 'green', 'Engine Size vs. CO2 Emissions', 'CO2 Emissions (g/km)'),
]

for scatter_col, scatter_color, scatter_title, scatter_ylabel in scatter_specs:
    plt.figure(figsize=(8, 6))
    plt.scatter(df['Engine Size(L)'], df[scatter_col], color=scatter_color, alpha=0.5)
    plt.title(scatter_title)
    plt.xlabel('Engine Size (L)')
    plt.ylabel(scatter_ylabel)
    plt.grid(True)
    plt.show()
In [16]:
# Box plots: combined fuel consumption per vehicle class, then CO2 emissions
# per fuel type.
# Each entry: (figure size, x column, y column, title, y-axis label, rotate x tick labels?)
box_specs = [
    ((10, 6), 'Vehicle Class', 'Fuel Consumption Comb (L/100 km)',
     'Fuel Consumption by Vehicle Class', 'Fuel Consumption Comb (L/100 km)', True),
    ((8, 6), 'Fuel Type', 'CO2 Emissions(g/km)',
     'CO2 Emissions by Fuel Type', 'CO2 Emissions (g/km)', False),
]

for box_size, box_x, box_y, box_title, box_ylabel, box_rotate in box_specs:
    plt.figure(figsize=box_size)
    sns.boxplot(data=df, x=box_x, y=box_y)
    plt.title(box_title)
    plt.xlabel(box_x)  # the x-axis label is the grouping column's own name
    plt.ylabel(box_ylabel)
    if box_rotate:
        # Only the vehicle-class plot tilts its tick labels
        plt.xticks(rotation=45)
    plt.show()
In [17]:
# Mean combined fuel consumption and mean CO2 emissions per vehicle class
by_vehicle_class = df.groupby('Vehicle Class')
avg_fuel_consumption_class = by_vehicle_class['Fuel Consumption Comb (L/100 km)'].mean().reset_index()
avg_co2_emissions_class = by_vehicle_class['CO2 Emissions(g/km)'].mean().reset_index()

# Side-by-side Seaborn bar charts of the two per-class averages.
# Each entry: (aggregated frame, value column, palette, title, y-axis label)
class_bar_specs = [
    (avg_fuel_consumption_class, 'Fuel Consumption Comb (L/100 km)', 'Blues',
     'Average Fuel Consumption by Vehicle Class', 'Average Fuel Consumption (L/100 km)'),
    (avg_co2_emissions_class, 'CO2 Emissions(g/km)', 'Greens',
     'Average CO2 Emissions by Vehicle Class', 'Average CO2 Emissions (g/km)'),
]

plt.figure(figsize=(12, 6))
for bar_position, (bar_frame, bar_col, bar_palette, bar_title, bar_ylabel) in enumerate(class_bar_specs, start=1):
    plt.subplot(1, 2, bar_position)
    sns.barplot(x='Vehicle Class', y=bar_col, data=bar_frame, palette=bar_palette)
    plt.title(bar_title)
    plt.xlabel('Vehicle Class')
    plt.ylabel(bar_ylabel)
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()
In [18]:
# Columns included in the correlation analysis
correlation_cols = [
    'Engine Size(L)',
    'Cylinders',
    'Fuel Consumption Comb (L/100 km)',
    'CO2 Emissions(g/km)',
]

# Pairwise correlation coefficients between the selected columns
selected_for_corr = df[correlation_cols]
correlation_matrix = selected_for_corr.corr()

# Annotated heatmap of the correlation matrix
heatmap_options = {
    'annot': True,
    'cmap': 'coolwarm',
    'fmt': ".2f",
    'annot_kws': {"size": 10},
}
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix')
plt.show()
In [19]:
# Build the predictive model: prepare X and y, split them into training and
# test sets, fit a linear regression, and print its evaluation metrics.

# Predictors (engine size and cylinder count) and target (combined fuel consumption)
predictor_columns = ['Engine Size(L)', 'Cylinders']
target_column = 'Fuel Consumption Comb (L/100 km)'
X = df[predictor_columns]
y = df[target_column]

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear regression on the training split (fit returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split
y_pred = model.predict(X_test)

# Evaluate the predictions against the true test values
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
for metric_label, metric_value in (("Mean Squared Error:", mse), ("R-squared Score:", r2)):
    print(metric_label, metric_value)
Mean Squared Error: 2.7114779983394364 R-squared Score: 0.6846010701229581
In [20]:
# Plot actual vs. predicted combined fuel consumption for the test set.
# y_test and y_pred come from the regression whose target is
# 'Fuel Consumption Comb (L/100 km)', so the title and axis labels name fuel
# consumption (they previously, and incorrectly, said CO2 emissions).
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, color='green', alpha=0.5)
# Dashed y = x reference line: points on it are perfectly predicted
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
plt.title('Actual vs Predicted Fuel Consumption')
plt.xlabel('Actual Fuel Consumption Comb (L/100 km)')
plt.ylabel('Predicted Fuel Consumption Comb (L/100 km)')
plt.grid(True)
plt.show()
In [21]:
# Group by engine size and compute the mean combined fuel consumption and the
# mean CO2 emissions for each size.
avg_fuel_consumption_engine = df.groupby('Engine Size(L)')['Fuel Consumption Comb (L/100 km)'].mean()
avg_co2_emissions_engine = df.groupby('Engine Size(L)')['CO2 Emissions(g/km)'].mean()

# The two averages have different units and very different magnitudes
# (L/100 km vs. g/km), so each gets its own y-axis; on a shared axis the fuel
# consumption curve is squashed flat against the bottom of the plot.
engine_fig, fuel_ax = plt.subplots(figsize=(10, 6))
co2_ax = fuel_ax.twinx()

fuel_line, = fuel_ax.plot(avg_fuel_consumption_engine.index, avg_fuel_consumption_engine,
                          marker='o', label='Average Fuel Consumption', color='skyblue')
co2_line, = co2_ax.plot(avg_co2_emissions_engine.index, avg_co2_emissions_engine,
                        marker='o', label='Average CO2 Emissions', color='lightgreen')

fuel_ax.set_xlabel('Engine Size (L)')
fuel_ax.set_ylabel('Average Fuel Consumption (L/100 km)')
co2_ax.set_ylabel('Average CO2 Emissions (g/km)')
fuel_ax.set_title('Average Fuel Consumption and CO2 Emissions by Engine Size')

# One combined legend, attached to the twin axis so that axis's line is not
# drawn over it.
co2_ax.legend(handles=[fuel_line, co2_line], loc='upper left')
fuel_ax.grid(True)
plt.show()
In [22]:
# Cluster analysis: group vehicles by engine size, cylinder count, and combined
# fuel consumption to explore similarities and differences between vehicles.

# Features used for clustering
features = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption Comb (L/100 km)']
# Extract the feature columns from the dataset
X = df[features]

# Number of clusters (change this value to suit your needs)
num_clusters = 3

# Initialise K-means. n_init is pinned explicitly because its default differs
# between scikit-learn releases, and this notebook silences the related
# deprecation warning; pinning keeps cluster assignments reproducible across
# library versions.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
# Fit K-means to the data
kmeans.fit(X)

# Store each row's cluster label in a new 'Cluster' column of df
df['Cluster'] = kmeans.labels_

# Visualise the clusters: engine size vs. fuel consumption
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Engine Size(L)', y='Fuel Consumption Comb (L/100 km)', hue='Cluster', data=df, palette='Set1')
plt.title('Clustering Based on Engine Size and Fuel Consumption')
plt.xlabel('Engine Size (L)')
plt.ylabel('Fuel Consumption (L/100 km)')
plt.show()

# Visualise the clusters: cylinder count vs. fuel consumption
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Cylinders', y='Fuel Consumption Comb (L/100 km)', hue='Cluster', data=df, palette='Set2')
plt.title('Clustering Based on Cylinders and Fuel Consumption')
plt.xlabel('Cylinders')
plt.ylabel('Fuel Consumption (L/100 km)')
plt.show()
In [ ]: