关于数据集 该数据集全面展示了汽车领域的燃油经济性,探索了不同年份、品牌和型号的各种车辆的详细信息。它包括发动机规格、燃料类型、传动系统的信息,甚至包括燃油经济性、二氧化碳排放量和技术特性等因素。
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
# 导入数据
df = pd.read_csv('fuel.csv')
df.head()
df.shape
df.info()
df.columns
df.describe()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 38113 entries, 0 to 38112 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 vehicle_id 38113 non-null int64 1 year 38113 non-null int64 2 make 38113 non-null object 3 model 38113 non-null object 4 class 38113 non-null object 5 drive 36924 non-null object 6 transmission 38102 non-null object 7 transmission_type 15045 non-null object 8 engine_index 38113 non-null int64 9 engine_descriptor 22693 non-null object 10 engine_cylinders 37977 non-null float64 11 engine_displacement 37979 non-null float64 12 turbocharger 5239 non-null object 13 supercharger 0 non-null float64 14 fuel_type 38113 non-null object 15 fuel_type_1 38113 non-null object 16 fuel_type_2 0 non-null float64 17 city_mpg_ft1 38113 non-null int64 18 unrounded_city_mpg_ft1 38113 non-null float64 19 city_mpg_ft2 38113 non-null int64 20 unrounded_city_mpg_ft2 38113 non-null float64 21 city_gasoline_consumption_cd 38113 non-null float64 22 city_electricity_consumption 38113 non-null float64 23 city_utility_factor 38113 non-null float64 24 highway_mpg_ft1 38113 non-null int64 25 unrounded_highway_mpg_ft1 38113 non-null float64 26 highway_mpg_ft2 38113 non-null int64 27 unrounded_highway_mpg_ft2 38113 non-null float64 28 highway_gasoline_consumption_cd 38113 non-null float64 29 highway_electricity_consumption 38113 non-null float64 30 highway_utility_factor 38113 non-null float64 31 unadjusted_city_mpg_ft1 38113 non-null float64 32 unadjusted_highway_mpg_ft1 38113 non-null float64 33 unadjusted_city_mpg_ft2 38113 non-null float64 34 unadjusted_highway_mpg_ft2 38113 non-null float64 35 combined_mpg_ft1 38113 non-null int64 36 unrounded_combined_mpg_ft1 38113 non-null float64 37 combined_mpg_ft2 38113 non-null int64 38 unrounded_combined_mpg_ft2 38113 non-null float64 39 combined_electricity_consumption 38113 non-null float64 40 combined_gasoline_consumption_cd 38113 non-null float64 41 combined_utility_factor 38113 non-null float64 42 annual_fuel_cost_ft1 38113 non-null int64 43 annual_fuel_cost_ft2 38113 non-null int64 44 gas_guzzler_tax 964 non-null object 45 save_or_spend_5_year 38113 non-null int64 46 annual_consumption_in_barrels_ft1 38113 non-null float64 47 annual_consumption_in_barrels_ft2 38113 non-null float64 48 tailpipe_co2_ft1 38113 non-null int64 49 tailpipe_co2_in_grams_mile_ft1 38113 non-null float64 50 tailpipe_co2_ft2 38113 non-null int64 51 tailpipe_co2_in_grams_mile_ft2 38113 non-null float64 52 fuel_economy_score 38113 non-null int64 53 ghg_score 38113 non-null int64 54 ghg_score_alt_fuel 38113 non-null int64 55 my_mpg_data 38113 non-null object 56 x2d_passenger_volume 38113 non-null int64 57 x2d_luggage_volume 38113 non-null int64 58 x4d_passenger_volume 38113 non-null int64 59 x4d_luggage_volume 38113 non-null int64 60 hatchback_passenger_volume 38113 non-null int64 61 hatchback_luggage_volume 38113 non-null int64 62 start_stop_technology 0 non-null float64 63 alternative_fuel_technology 3047 non-null object 64 electric_motor 0 non-null float64 65 manufacturer_code 0 non-null float64 66 gasoline_electricity_blended_cd 38113 non-null bool 67 vehicle_charger 0 non-null float64 68 alternate_charger 0 non-null float64 69 hours_to_charge_120v 38113 non-null int64 70 hours_to_charge_240v 38113 non-null float64 71 hours_to_charge_ac_240v 38113 non-null float64 72 composite_city_mpg 38113 non-null int64 73 composite_highway_mpg 38113 non-null int64 74 composite_combined_mpg 38113 non-null int64 75 range_ft1 38113 non-null int64 76 city_range_ft1 38113 non-null float64 77 highway_range_ft1 38113 non-null float64 78 range_ft2 0 non-null float64 79 city_range_ft2 38113 non-null float64 80 highway_range_ft2 38113 non-null float64 dtypes: bool(1), float64(39), int64(28), object(13) memory usage: 23.3+ MB
Out[1]:
| vehicle_id | year | engine_index | engine_cylinders | engine_displacement | supercharger | fuel_type_2 | city_mpg_ft1 | unrounded_city_mpg_ft1 | city_mpg_ft2 | ... | hours_to_charge_ac_240v | composite_city_mpg | composite_highway_mpg | composite_combined_mpg | range_ft1 | city_range_ft1 | highway_range_ft1 | range_ft2 | city_range_ft2 | highway_range_ft2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 38113.000000 | 38113.000000 | 38113.000000 | 37977.000000 | 37979.000000 | 0.0 | 0.0 | 38113.000000 | 38113.000000 | 38113.000000 | ... | 38113.000000 | 38113.000000 | 38113.000000 | 38113.000000 | 38113.000000 | 38113.000000 | 38113.000000 | 0.0 | 38113.000000 | 38113.000000 |
| mean | 19170.638496 | 2000.194527 | 8799.389001 | 5.736656 | 3.317583 | NaN | NaN | 17.981109 | 4.606426 | 0.546218 | ... | 0.005549 | 0.082203 | 0.080891 | 0.081311 | 0.469708 | 0.426249 | 0.419197 | NaN | 0.043973 | 0.040051 |
| std | 11134.878665 | 10.464573 | 17781.058490 | 1.752254 | 1.361995 | NaN | NaN | 6.849728 | 10.113963 | 4.109282 | ... | 0.161014 | 2.156682 | 2.052187 | 2.097794 | 9.352069 | 9.104702 | 9.315914 | NaN | 1.311628 | 1.169281 |
| min | 1.000000 | 1984.000000 | 0.000000 | 2.000000 | 0.000000 | NaN | NaN | 6.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
| 25% | 9529.000000 | 1991.000000 | 0.000000 | 4.000000 | 2.200000 | NaN | NaN | 15.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
| 50% | 19058.000000 | 2001.000000 | 212.000000 | 6.000000 | 3.000000 | NaN | NaN | 17.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
| 75% | 28779.000000 | 2009.000000 | 4451.000000 | 6.000000 | 4.300000 | NaN | NaN | 20.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 |
| max | 38542.000000 | 2017.000000 | 69102.000000 | 16.000000 | 8.400000 | NaN | NaN | 150.000000 | 150.000000 | 145.000000 | ... | 7.000000 | 97.000000 | 81.000000 | 88.000000 | 315.000000 | 305.900000 | 346.900000 | NaN | 103.030000 | 90.550000 |
8 rows × 67 columns
In [2]:
# 使用选定的列创建原始数据框架的子集
df = df[['vehicle_id', 'year', 'make', 'model', 'class', 'drive', 'transmission',
'engine_cylinders', 'engine_displacement', 'turbocharger',
'supercharger', 'fuel_type',
'city_mpg_ft1',
'city_gasoline_consumption_cd',
'city_electricity_consumption', 'city_utility_factor',
'highway_mpg_ft1',
'combined_mpg_ft1',
'annual_fuel_cost_ft1',
'tailpipe_co2_in_grams_mile_ft1',
'start_stop_technology',
'electric_motor',
'gasoline_electricity_blended_cd',
]].copy() # 创建数据帧的副本,而不是引用
In [3]:
# 重命名列以获得更好的用户解释
df = df.rename(columns={
'vehicle_id':'Vehicle_ID',
'year':'Year',
'make':'Make',
'model':'Model',
'class':'Class',
'drive':'Drivetrain',
'transmission':'Transmission',
'engine_cylinders':'Cylinders',
'engine_displacement':'Displacement',
'turbocharger':'Turbo',
'supercharger':'Supercharger',
'fuel_type':'Fuel_Type',
'city_mpg_ft1':'City_MPG',
'citry_gasoline_consumption_cd':'City_Gas_Consumption',
'city_electric_consumption':'City_Electricity_Consumption',
'city_utility_factor':'City_Utility_Factor',
'highway_mpg_ft1':'Highway_MPG',
'combined_mpg_ft1':'Combined_MPG',
'annual_fuel_cost_ft1':'Annual_Fuel_Cost',
'tailpipe_co2_in_grams_mile_ft1':'CO2_Emissions_g',
'start_stop_technology':'Start_Stop',
'electric_motor':'Electric Motor',
'gasoline_electricity_blended_cd':'Hybrid'
})
In [4]:
df.dtypes
Out[4]:
Vehicle_ID int64 Year int64 Make object Model object Class object Drivetrain object Transmission object Cylinders float64 Displacement float64 Turbo object Supercharger float64 Fuel_Type object City_MPG int64 city_gasoline_consumption_cd float64 city_electricity_consumption float64 City_Utility_Factor float64 Highway_MPG int64 Combined_MPG int64 Annual_Fuel_Cost int64 CO2_Emissions_g float64 Start_Stop float64 Electric Motor float64 Hybrid bool dtype: object
In [6]:
# 根据每年引进的汽车数量创建一个条形图
ax = df['Year'].value_counts().head(10).plot(kind='bar', title='Top Years Cars Introduced')
ax.set_xlabel('Year Introduced')
ax.set_ylabel('Count')
plt.show()
In [7]:
# 创建直方图,以确定联合燃油经济性的频率
ax = df['Combined_MPG'].plot(kind='hist', bins=40, title='Vehicle Fuel Economy', )
ax.set_xlabel('Fuel Economy (MPG)')
plt.show()
In [8]:
# 创建KDE图以确定组合燃油经济性的频率
ax = df['Combined_MPG'].plot(kind='kde', title='Coaster Speed (mph)')
ax.set_xlabel('Fuel Economy (MPG)')
plt.show()
In [9]:
# 散点图显示排量与综合MPG分布
df.plot(kind='scatter', x='Displacement', y='Combined_MPG', title='Coaster Speed vs Height')
plt.show()
In [10]:
# 使用Seaborn的散点图来提供位移与基于引入年份的色调组合MPG
sns.scatterplot(x='Displacement', y='Combined_MPG', data = df, hue='Year')
plt.show()
df_corr = df[['Year', 'Combined_MPG','Hybrid', 'CO2_Emissions_g', 'Annual_Fuel_Cost']].dropna().corr()
sns.heatmap(df_corr, annot=True)
plt.show()
In [11]:
# 哪一年新车的平均燃油经济性超过每加仑20英里?
# 按年分组,计算平均综合MPG
average_mpg_by_year = df.groupby('Year')['Combined_MPG'].mean()
# 寻找平均燃油经济性超过20英里/加仑的年份
year_exceeding_20mpg = average_mpg_by_year[average_mpg_by_year > 20].index.min()
print("The year when the average fuel economy of new cars exceeded 20 MPG is:", year_exceeding_20mpg)
The year when the average fuel economy of new cars exceeded 20 MPG is: 2010
In [12]:
# 在过去十年中,燃油经济性排名前十的汽车品牌是什么?(2007 - 2017)
# 过滤2007-2017年的数据
df_decade = df[df['Year'].between(2007, 2017)]
# 计算每个汽车品牌的平均燃油经济性
average_mpg_by_brand = df_decade.groupby('Make')['Combined_MPG'].mean()
# 根据平均燃油经济性对汽车品牌进行排序,并选出前10名
top_10_brands = average_mpg_by_brand.sort_values(ascending=False).head(10)
# 绘制条形图
plt.figure(figsize=(10, 6))
top_10_brands.plot(kind='bar', color='skyblue')
plt.title('Top 10 Car Brands based on Fuel Economy (2007-2017)')
plt.xlabel('Car Brand')
plt.ylabel('Average Combined MPG')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
In [13]:
# 不同燃料类型和气缸的二氧化碳排放量比较如何?
# 过滤掉在燃料类型和气缸列中缺少值的行
df_filtered = df.dropna(subset=['Fuel_Type', 'Cylinders'])
# 按“燃料类型”和“气缸”对数据进行分组,并计算平均二氧化碳排放量
avg_co2_by_fuel_and_cylinders = df_filtered.groupby(['Fuel_Type', 'Cylinders'])['CO2_Emissions_g'].mean().unstack()
# 绘制分组条形图
avg_co2_by_fuel_and_cylinders.plot(kind='bar', stacked=False)
plt.title('Average CO2 Emissions by Fuel Type and Cylinder Count')
plt.xlabel('Fuel Type')
plt.ylabel('Average CO2 Emissions (g)')
plt.xticks(rotation=90)
plt.legend(title='Cylinders', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
In [15]:
# 某些传动系统的流行有什么趋势吗?
grouped_by_year_class = df.groupby(['Year', 'Class']).size().unstack()
# 将数据按年份和传动系统分组
grouped_by_year_drivetrain = df.groupby(['Year', 'Drivetrain']).size().unstack()
# 绘制动力传动系统的趋势图
plt.figure(figsize=(12, 10))
grouped_by_year_drivetrain.plot(kind='line', marker='o', linewidth=2)
plt.title('Trends in Drivetrain Popularity Over Time')
plt.xlabel('Year')
plt.ylabel('Number of Vehicles')
plt.legend(title='Drivetrain', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()
<Figure size 1200x1000 with 0 Axes>
In [16]:
# 这些年来,汽车的平均燃油经济性(综合mpg)发生了怎样的变化?
# 将数据按年分组,计算平均组合mpg
avg_mpg_by_year = df.groupby('Year')['Combined_MPG'].mean()
# 绘制平均燃油经济性随时间变化的趋势
plt.figure(figsize=(10, 6))
avg_mpg_by_year.plot(kind='line', marker='o', color='b', linewidth=2)
plt.title('Average Fuel Economy (Combined MPG) Over Time')
plt.xlabel('Year')
plt.ylabel('Average Combined MPG')
plt.grid(True)
plt.tight_layout()
plt.show()
In [17]:
# 不同类型的变速箱(如手动、自动、无级变速),平均燃油经济性有何不同?
# 筛选相关列
transmission_data = df[['Transmission', 'Combined_MPG']]
# 将数据按传输类型分组,计算平均组合mpg
avg_mpg_by_transmission = transmission_data.groupby('Transmission')['Combined_MPG'].mean().sort_values(ascending=False)
# 选择十大最高效的变速器
top_ten_transmissions = avg_mpg_by_transmission.head(10)
# 绘制出十大最高效变速器类型的平均燃油经济性
plt.figure(figsize=(10, 6))
top_ten_transmissions.plot(kind='bar', color='skyblue')
plt.title('Top Ten Most Efficient Transmissions')
plt.xlabel('Transmission Type')
plt.ylabel('Average Combined MPG')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
In [ ]: