In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the raw EV registration records and preview the first five rows.
csv_path = 'Electric_Vehicle_Population_Data.csv'
df = pd.read_csv(csv_path)
df.head()
Out[1]:
| VIN (1-10) | County | City | State | Postal Code | Model Year | Make | Model | Electric Vehicle Type | Clean Alternative Fuel Vehicle (CAFV) Eligibility | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | Vehicle Location | Electric Utility | 2020 Census Tract | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5YJYGDEE1L | King | Seattle | WA | 98122.0 | 2020 | TESLA | MODEL Y | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 291 | 0 | 37.0 | 125701579 | POINT (-122.30839 47.610365) | CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA) | 5.303301e+10 |
| 1 | 7SAYGDEE9P | Snohomish | Bothell | WA | 98021.0 | 2023 | TESLA | MODEL Y | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | 0 | 0 | 1.0 | 244285107 | POINT (-122.179458 47.802589) | PUGET SOUND ENERGY INC | 5.306105e+10 |
| 2 | 5YJSA1E4XK | King | Seattle | WA | 98109.0 | 2019 | TESLA | MODEL S | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 270 | 0 | 36.0 | 156773144 | POINT (-122.34848 47.632405) | CITY OF SEATTLE - (WA)|CITY OF TACOMA - (WA) | 5.303301e+10 |
| 3 | 5YJSA1E27G | King | Issaquah | WA | 98027.0 | 2016 | TESLA | MODEL S | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 210 | 0 | 5.0 | 165103011 | POINT (-122.03646 47.534065) | PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) | 5.303303e+10 |
| 4 | 5YJYGDEE5M | Kitsap | Suquamish | WA | 98392.0 | 2021 | TESLA | MODEL Y | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | 0 | 0 | 23.0 | 205138552 | POINT (-122.55717 47.733415) | PUGET SOUND ENERGY INC | 5.303594e+10 |
In [2]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape
Out[2]:
(177866, 17)
In [3]:
# Per-column dtype and non-null count; columns whose non-null count is below
# the number of entries contain missing values.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 177866 entries, 0 to 177865 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 VIN (1-10) 177866 non-null object 1 County 177861 non-null object 2 City 177861 non-null object 3 State 177866 non-null object 4 Postal Code 177861 non-null float64 5 Model Year 177866 non-null int64 6 Make 177866 non-null object 7 Model 177866 non-null object 8 Electric Vehicle Type 177866 non-null object 9 Clean Alternative Fuel Vehicle (CAFV) Eligibility 177866 non-null object 10 Electric Range 177866 non-null int64 11 Base MSRP 177866 non-null int64 12 Legislative District 177477 non-null float64 13 DOL Vehicle ID 177866 non-null int64 14 Vehicle Location 177857 non-null object 15 Electric Utility 177861 non-null object 16 2020 Census Tract 177861 non-null float64 dtypes: float64(3), int64(4), object(10) memory usage: 23.1+ MB
In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
Out[4]:
| Postal Code | Model Year | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | 2020 Census Tract | |
|---|---|---|---|---|---|---|---|
| count | 177861.000000 | 177866.000000 | 177866.000000 | 177866.000000 | 177477.000000 | 1.778660e+05 | 1.778610e+05 |
| mean | 98172.453506 | 2020.515512 | 58.842162 | 1073.109363 | 29.127481 | 2.202313e+08 | 5.297672e+10 |
| std | 2442.450668 | 2.989384 | 91.981298 | 8358.624956 | 14.892169 | 7.584987e+07 | 1.578047e+09 |
| min | 1545.000000 | 1997.000000 | 0.000000 | 0.000000 | 1.000000 | 4.385000e+03 | 1.001020e+09 |
| 25% | 98052.000000 | 2019.000000 | 0.000000 | 0.000000 | 18.000000 | 1.814743e+08 | 5.303301e+10 |
| 50% | 98122.000000 | 2022.000000 | 0.000000 | 0.000000 | 33.000000 | 2.282522e+08 | 5.303303e+10 |
| 75% | 98370.000000 | 2023.000000 | 75.000000 | 0.000000 | 42.000000 | 2.548445e+08 | 5.305307e+10 |
| max | 99577.000000 | 2024.000000 | 337.000000 | 845000.000000 | 49.000000 | 4.792548e+08 | 5.603300e+10 |
In [5]:
# Summary of the object-dtype columns: count, number of unique values,
# most frequent value (top) and how often it occurs (freq).
df.describe(include='O')
Out[5]:
| VIN (1-10) | County | City | State | Make | Model | Electric Vehicle Type | Clean Alternative Fuel Vehicle (CAFV) Eligibility | Vehicle Location | Electric Utility | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 177866 | 177861 | 177861 | 177866 | 177866 | 177866 | 177866 | 177866 | 177857 | 177861 |
| unique | 10830 | 196 | 723 | 46 | 40 | 139 | 2 | 3 | 861 | 76 |
| top | 7SAYGDEE6P | King | Seattle | WA | TESLA | MODEL Y | Battery Electric Vehicle (BEV) | Eligibility unknown as battery range has not b... | POINT (-122.12302 47.67668) | PUGET SOUND ENERGY INC||CITY OF TACOMA - (WA) |
| freq | 1239 | 92740 | 29447 | 177477 | 79659 | 35989 | 139210 | 91950 | 4574 | 65990 |
In [6]:
# Report the data-quality figures explicitly. A notebook cell only renders its
# final expression, so bare expressions placed before the dropna call are
# evaluated and silently discarded.
print('Missing values per column:')
print(df.isnull().sum())
print('Fully duplicated rows:', df.duplicated().sum())

# County and Postal Code each have 5 missing values and Legislative District
# has 389; drop every row that contains any missing value.
# NOTE(review): the 389 rows without a Legislative District look like the
# non-WA registrations (State == 'WA' occurs 177477 times, the same as the
# non-null district count) -- confirm that excluding them is intended.
# Rebind rather than use inplace=True so the frame object is not mutated in place.
df = df.dropna()
In [7]:
# Visualization 1: the ten makes with the most electric vehicles in the data.
sns.set_style("whitegrid")
ev_counts_by_make = df['Make'].value_counts().nlargest(10)

# Horizontal bars: vehicle count on x, make on y.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=ev_counts_by_make.values,
    y=ev_counts_by_make.index,
    palette="viridis",
    ax=ax,
)
ax.set_title('Top 10 Electric Vehicle Makes by Number of Electric Vehicles', fontsize=15)
ax.set_xlabel('Number of Vehicles', fontsize=12)
ax.set_ylabel('Make', fontsize=12)
plt.show()
In [8]:
# Visualization 2: EV adoption over time, measured as vehicle count per model year.
sns.set_style("whitegrid")
# Count rows per model year and order the result chronologically.
ev_adoption_over_time = df['Model Year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(
    x=ev_adoption_over_time.index,
    y=ev_adoption_over_time.values,
    marker='o',
    color='royalblue',
    ax=ax,
)
ax.set_title('EV Adoption Over Time', fontsize=20)
ax.set_xlabel('Model Year', fontsize=14)
ax.set_ylabel('Number of EV Registrations', fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# The "EV Adoption Over Time" chart shows a marked rise in EV registrations that
# begins in the early 2010s and accelerates further in the 2020s. Adoption in
# recent years looks close to exponential, suggesting EVs keep gaining popularity.
In [10]:
# Visualization 3: the ten counties with the most electric vehicles.
# Count VIN entries per county, then rank counties from most to fewest.
ev_count_distribution = (
    df.groupby('County')['VIN (1-10)']
    .count()
    .reset_index()
    .sort_values(by='VIN (1-10)', ascending=False)
)
top_ev_counties = ev_count_distribution.head(10)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_ev_counties, x='VIN (1-10)', y='County', palette='viridis', ax=ax)
ax.set_title('Top 10 Counties by Electric Vehicle Counts')
ax.set_xlabel('Number of Electric Vehicles')
ax.set_ylabel('County')
plt.tight_layout()
plt.show()
# The chart shows King County leading EV adoption, followed by Snohomish and
# Pierce counties.
In [11]:
# Visualization 4: popularity of battery-electric (BEV) versus plug-in hybrid
# (PHEV) vehicles by model year.
# Keep only the rows whose type is BEV or PHEV.
ev_types_df = df[df['Electric Vehicle Type'].isin(['Battery Electric Vehicle (BEV)', 'Plug-in Hybrid Electric Vehicle (PHEV)'])]
# Count vehicles per (model year, vehicle type) and pivot the types into columns;
# combinations that never occur are filled with 0.
yearly_ev_counts = (
    ev_types_df.groupby(['Model Year', 'Electric Vehicle Type'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
sns.set_style("whitegrid")
# Create the axes once and hand it to DataFrame.plot. Calling plt.figure() and
# then DataFrame.plot(figsize=...) opens two figures, which leaves an empty
# "<Figure size 1400x800 with 0 Axes>" in the cell output.
fig, ax = plt.subplots(figsize=(14, 8))
yearly_ev_counts.plot(kind='bar', stacked=True, x='Model Year', width=0.8, ax=ax)
ax.set_title('Comparison of BEVs and PHEVs Popularity Over Years', fontsize=16)
ax.set_xlabel('Model Year', fontsize=14)
ax.set_ylabel('Number of Vehicles', fontsize=14)
plt.xticks(rotation=45)
ax.legend(title='Electric Vehicle Type', fontsize=12)
plt.tight_layout()
plt.show()
# Comparing BEVs and PHEVs across model years highlights a clear trend: BEVs are
# increasingly popular, especially in 2023.
<Figure size 1400x800 with 0 Axes>
In [13]:
# Visualization 5: electric range versus model year, with a fitted trend line.
# NOTE(review): an Electric Range of 0 appears to be a placeholder rather than a
# measurement -- the preview rows with range 0 carry the eligibility text
# "Eligibility unknown as battery range has not b..." and the median range is 0.
# Those rows are excluded so they do not distort the scatter and the fitted
# line; confirm this reading of the data.
range_known_df = df[df['Electric Range'] > 0]

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=range_known_df, x='Model Year', y='Electric Range', alpha=0.6, ax=ax)
sns.regplot(data=range_known_df, x='Model Year', y='Electric Range', scatter=False, color='red', ax=ax)
# Apply the title and labels after regplot: regplot relabels the axes with the
# column names, which would otherwise replace "Electric Range (miles)".
ax.set_title('Improvement in Electric Range of Vehicles Over the Years')
ax.set_xlabel('Model Year')
ax.set_ylabel('Electric Range (miles)')
plt.show()
# The original note read this scatter plot with its regression line as a clear
# upward trend in electric range over the years. TODO: re-check that reading
# against the plot now that placeholder zeros are excluded.
In [14]:
# Visualization 6: distribution of EV base prices by model year.
# Keep only rows whose Base MSRP is strictly between 0 and 200000.
filtered_df = df.loc[df['Base MSRP'].gt(0) & df['Base MSRP'].lt(200000)]

sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=filtered_df, x='Model Year', y='Base MSRP', palette="viridis", ax=ax)
ax.set_title('Distribution of Electric Vehicle Prices Over the Years', fontsize=16)
ax.set_xlabel('Model Year', fontsize=14)
ax.set_ylabel('Base MSRP ($)', fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# The chart shows the median EV price rising in recent years. Prices for
# 2008-2011 are very high compared with today.
In [15]:
# Visualization 7: price distribution for the ten most common makes.
# The ranking is taken from filtered_df, i.e. among rows that passed the
# Base MSRP filter.
top_makes = filtered_df['Make'].value_counts().nlargest(10).index
filtered_top_makes_df = filtered_df.loc[filtered_df['Make'].isin(top_makes)]

fig, ax = plt.subplots(figsize=(16, 10))
sns.boxplot(data=filtered_top_makes_df, x='Make', y='Base MSRP', palette="coolwarm", ax=ax)
ax.set_title('Distribution of Electric Vehicle Prices by Make (Top 10 Makes)', fontsize=16)
ax.set_xlabel('Make', fontsize=14)
ax.set_ylabel('Base MSRP ($)', fontsize=14)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# The box plot shows notable variation in base MSRP across the top 10 makes.
# Porsche is very popular, with prices ranging from 80000 to 180000; Fisker has
# the second-highest prices.
In [16]:
# Visualization 8: the ten electric utilities serving the most electric vehicles.
# Count DOL Vehicle IDs per utility, then keep the ten largest counts.
utility_counts = (
    df.groupby('Electric Utility')['DOL Vehicle ID']
    .count()
    .reset_index()
)
utility_counts_sorted = utility_counts.sort_values(by='DOL Vehicle ID', ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=utility_counts_sorted,
    x='DOL Vehicle ID',
    y='Electric Utility',
    palette='viridis',
    # Fix the bar order to the ranked order of the utilities.
    order=utility_counts_sorted['Electric Utility'],
    ax=ax,
)
ax.set_title('Top 10 Electric Utilities by Number of Electric Vehicles')
ax.set_xlabel('Number of Electric Vehicles')
ax.set_ylabel('Electric Utility')
plt.tight_layout()
plt.show()
# The bar chart ranks electric utilities by vehicle count. Puget Sound Energy
# serves the most electric vehicles.
In [17]:
# Visualization 9: electric vehicles by legislative district.
# Group by district and count vehicles via DOL Vehicle ID.
district_counts = df.groupby('Legislative District')['DOL Vehicle ID'].count().reset_index()
# The district column is float64; cast the group keys to int so the tick labels
# read as whole district numbers. groupby drops missing keys by default, so the
# cast cannot hit NaN.
district_counts['Legislative District'] = district_counts['Legislative District'].astype(int)
district_counts_sorted = district_counts.sort_values(by='DOL Vehicle ID', ascending=False)

fig, ax = plt.subplots(figsize=(14, 8))
sns.barplot(
    data=district_counts_sorted,
    x='Legislative District',
    y='DOL Vehicle ID',
    palette='coolwarm',
    # Pass the ranked order explicitly (as the utilities chart does); without
    # it seaborn orders numeric categories itself and the sort has no effect.
    order=district_counts_sorted['Legislative District'],
    ax=ax,
)
ax.set_title('Electric Vehicles by Legislative District')
ax.set_xlabel('Legislative District')
ax.set_ylabel('Number of Electric Vehicles')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# The chart shows how EV adoption differs across legislative districts, with
# districts 41, 45 and 48 far ahead.