In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('vgchartz-2024.csv')
df.head()
Out[1]:
| img | title | console | genre | publisher | developer | critic_score | total_sales | na_sales | jp_sales | pal_sales | other_sales | release_date | last_update | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | /games/boxart/full_6510540AmericaFrontccc.jpg | Grand Theft Auto V | PS3 | Action | Rockstar Games | Rockstar North | 9.4 | 20.32 | 6.37 | 0.99 | 9.85 | 3.12 | 2013-09-17 | NaN |
| 1 | /games/boxart/full_5563178AmericaFrontccc.jpg | Grand Theft Auto V | PS4 | Action | Rockstar Games | Rockstar North | 9.7 | 19.39 | 6.06 | 0.60 | 9.71 | 3.02 | 2014-11-18 | 2018-01-03 |
| 2 | /games/boxart/827563ccc.jpg | Grand Theft Auto: Vice City | PS2 | Action | Rockstar Games | Rockstar North | 9.6 | 16.15 | 8.41 | 0.47 | 5.49 | 1.78 | 2002-10-28 | NaN |
| 3 | /games/boxart/full_9218923AmericaFrontccc.jpg | Grand Theft Auto V | X360 | Action | Rockstar Games | Rockstar North | NaN | 15.86 | 9.06 | 0.06 | 5.33 | 1.42 | 2013-09-17 | NaN |
| 4 | /games/boxart/full_4990510AmericaFrontccc.jpg | Call of Duty: Black Ops 3 | PS4 | Shooter | Activision | Treyarch | 8.1 | 15.09 | 6.18 | 0.41 | 6.05 | 2.44 | 2015-11-06 | 2018-01-14 |
In [2]:
df.shape
Out[2]:
(64016, 14)
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 64016 entries, 0 to 64015 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 img 64016 non-null object 1 title 64016 non-null object 2 console 64016 non-null object 3 genre 64016 non-null object 4 publisher 64016 non-null object 5 developer 63999 non-null object 6 critic_score 6678 non-null float64 7 total_sales 18922 non-null float64 8 na_sales 12637 non-null float64 9 jp_sales 6726 non-null float64 10 pal_sales 12824 non-null float64 11 other_sales 15128 non-null float64 12 release_date 56965 non-null object 13 last_update 17879 non-null object dtypes: float64(6), object(8) memory usage: 6.8+ MB
In [4]:
df.describe()
Out[4]:
| critic_score | total_sales | na_sales | jp_sales | pal_sales | other_sales | |
|---|---|---|---|---|---|---|
| count | 6678.000000 | 18922.000000 | 12637.000000 | 6726.000000 | 12824.000000 | 15128.000000 |
| mean | 7.220440 | 0.349113 | 0.264740 | 0.102281 | 0.149472 | 0.043041 |
| std | 1.457066 | 0.807462 | 0.494787 | 0.168811 | 0.392653 | 0.126643 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 6.400000 | 0.030000 | 0.050000 | 0.020000 | 0.010000 | 0.000000 |
| 50% | 7.500000 | 0.120000 | 0.120000 | 0.040000 | 0.040000 | 0.010000 |
| 75% | 8.300000 | 0.340000 | 0.280000 | 0.120000 | 0.140000 | 0.030000 |
| max | 10.000000 | 20.320000 | 9.760000 | 2.130000 | 9.850000 | 3.120000 |
In [6]:
# 查看非数值型变量的描述性统计
df.describe(include='O')
Out[6]:
| img | title | console | genre | publisher | developer | release_date | last_update | |
|---|---|---|---|---|---|---|---|---|
| count | 64016 | 64016 | 64016 | 64016 | 64016 | 63999 | 56965 | 17879 |
| unique | 56177 | 39798 | 81 | 20 | 3383 | 8862 | 7922 | 1545 |
| top | /games/boxart/default.jpg | Plants vs. Zombies | PC | Misc | Unknown | Unknown | 1994-01-01 | 2018-01-06 |
| freq | 7810 | 17 | 12617 | 9304 | 8842 | 4435 | 515 | 165 |
数据可视化¶
In [8]:
# 最畅销游戏
top_selling_games=df.groupby('title')['total_sales'].sum().sort_values(ascending=False).head(10)
plt.figure(figsize=(8,6))
ax = sns.barplot(x=top_selling_games.index, y=top_selling_games.values, palette='rocket')
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}M'))
plt.xticks(rotation=90)
plt.show()
C:\Users\黄清枫\AppData\Local\Temp\ipykernel_8740\2321252284.py:4: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(x=top_selling_games.index, y=top_selling_games.values, palette='rocket')
In [10]:
# 各平台最畅销游戏
platforms = df.groupby(['title', 'console'])['total_sales'].sum().sort_values(ascending=False).head(10)
titles = [index[0] for index in platforms.index]
consoles = [index[1] for index in platforms.index]
sales = platforms.values
unique_consoles = np.unique(consoles)
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_consoles)))
console_colors = dict(zip(unique_consoles, colors))
sales_in_millions = sales
plt.figure(figsize=(14, 10))
for i in range(len(titles)):
console = consoles[i]
color = console_colors[console]
plt.bar(titles[i], sales_in_millions[i], color=color, label=console)
plt.xlabel('Game Titles')
plt.ylabel('Sales (Millions)')
plt.title('Top Selling Games by Platform')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Console',loc='best')
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}M'))
plt.tight_layout()
plt.show()
In [13]:
# 评价和销量最高的类型
genre_stats=df.groupby('genre').agg({'critic_score':'median','total_sales':'sum'})
sorted_genres_by_score=genre_stats.sort_values(by='critic_score',ascending=False)
sorted_genres_by_sales=genre_stats.sort_values(by='total_sales',ascending=False)
palette = sns.color_palette("tab20", len(sorted_genres_by_score))
plt.figure(figsize=(10, 6))
# ax=sns.barplot(x=sorted_genres_by_score.index, y=sorted_genres_by_score['critic_score'], palette=palette)
ax = sns.barplot(x=sorted_genres_by_score.index,
y=sorted_genres_by_score['critic_score'],
hue=sorted_genres_by_score.index,
palette=palette,
legend=False)
plt.title('Genres with Highest Critic Scores')
plt.xlabel('Genre')
plt.ylabel('Median Critic Score')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
In [12]:
plt.figure(figsize=(12, 6))
ax=sns.barplot(x=sorted_genres_by_sales.index, y=sorted_genres_by_sales['total_sales'], palette=palette)
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}M'))
plt.title('Genres with Highest Total Sales')
plt.xlabel('Genre')
plt.ylabel('Total Sales')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
C:\Users\黄清枫\AppData\Local\Temp\ipykernel_8740\1711725804.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. ax=sns.barplot(x=sorted_genres_by_sales.index, y=sorted_genres_by_sales['total_sales'], palette=palette)
In [15]:
# 发行产品最多的发行商及其销售业绩
publisher_stats = df.groupby('publisher').agg({
'title': 'count',
'total_sales': 'sum'
}).reset_index()
# 按发布数量对发布者进行排序
sorted_publishers_by_num = publisher_stats.sort_values(
by='title', ascending=False).head(10)
sorted_publishers_by_sales = publisher_stats.sort_values(
by='total_sales', ascending=False).head(10)
palette = sns.color_palette("tab20", len(sorted_genres_by_score))
plt.figure(figsize=(12, 6))
ax = sns.barplot(x='title', y='publisher',
data=sorted_publishers_by_num, palette=palette)
plt.title('Number of Releases by Publisher')
plt.xlabel('Number of Releases')
plt.ylabel('Publisher')
plt.tight_layout()
plt.show()
# 按出版商绘制销售业绩图
plt.figure(figsize=(12, 6))
ax=sns.barplot(x='total_sales', y='publisher', data=sorted_publishers_by_sales, palette=palette)
plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}M'))
plt.title('Total Sales Performance by Publisher')
plt.xlabel('Total Sales')
plt.ylabel('Publisher')
plt.tight_layout()
plt.show()
C:\Users\黄清枫\AppData\Local\Temp\ipykernel_8740\2328367660.py:15: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. ax = sns.barplot(x='title', y='publisher', C:\Users\黄清枫\AppData\Local\Temp\ipykernel_8740\2328367660.py:15: UserWarning: The palette list has more values (20) than needed (10), which may not be intended. ax = sns.barplot(x='title', y='publisher',
C:\Users\黄清枫\AppData\Local\Temp\ipykernel_8740\2328367660.py:24: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. ax=sns.barplot(x='total_sales', y='publisher', data=sorted_publishers_by_sales, palette=palette) C:\Users\黄清枫\AppData\Local\Temp\ipykernel_8740\2328367660.py:24: UserWarning: The palette list has more values (20) than needed (10), which may not be intended. ax=sns.barplot(x='total_sales', y='publisher', data=sorted_publishers_by_sales, palette=palette)
In [16]:
# 比较不同发行商的销量
publisher_sales = df.groupby('publisher')['total_sales'].sum().reset_index()
# 按总销售额对出版商进行排序
sorted_publishers = publisher_sales.sort_values(by='total_sales', ascending=False).head(150)
# 使用点图绘制发行商的总销售额
plt.figure(figsize=(10, 20))
ax=sns.pointplot(x='total_sales', y='publisher', data=sorted_publishers, join=False, palette='muted')
plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x:.0f}M'))
plt.xlabel('Total Sales')
plt.ylabel('Publisher')
plt.title('Total Sales by Publisher')
plt.tight_layout()
plt.show()
C:\Users\黄清枫\AppData\Local\Temp\ipykernel_8740\2917064796.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. ax=sns.pointplot(x='total_sales', y='publisher', data=sorted_publishers, join=False, palette='muted') C:\Users\黄清枫\AppData\Local\Temp\ipykernel_8740\2917064796.py:7: UserWarning: The `join` parameter is deprecated and will be removed in v0.15.0. You can remove the line between points with `linestyle='none'`. ax=sns.pointplot(x='total_sales', y='publisher', data=sorted_publishers, join=False, palette='muted')
In [17]:
# 分析不同地区的类型偏好
genre_sales = df.groupby('genre')[['na_sales', 'jp_sales', 'pal_sales', 'other_sales']].sum()
genre_sales_normalized = genre_sales.div(genre_sales.sum(axis=1), axis=0) * 100
plt.figure(figsize=(12, 8))
sns.heatmap(genre_sales_normalized, cmap='rocket', annot=True, fmt='.1f', linewidths=.5)
plt.title('Genre Preferences Across Regions')
plt.xlabel('Region')
plt.ylabel('Genre')
plt.tight_layout()
plt.show()
In [18]:
# 检查北美,日本和PAL地区的销售分布
sales_data = df[['genre', 'na_sales', 'jp_sales', 'pal_sales']]
sales_data_melted = sales_data.melt(id_vars='genre', var_name='region', value_name='sales')
plt.figure(figsize=(12, 8))
sns.barplot(data=sales_data_melted, x='genre', y='sales', hue='region', palette='muted')
plt.title('Sales Distribution in North America, Japan, and PAL Regions by Genre')
plt.xlabel('Genre')
plt.ylabel('Sales')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Region', loc='upper right')
plt.tight_layout()
plt.show()
In [19]:
# 比较新旧平台之间的销售趋势
threshold_release_year = 2010
df['release_date'] = pd.to_datetime(df['release_date'])
df['platform_category'] = np.where(df['release_date'].dt.year < threshold_release_year, 'Older', 'Newer')
sales_trends = df.groupby(['platform_category', df['release_date'].dt.year])['total_sales'].sum().reset_index()
plt.figure(figsize=(12, 8))
sns.lineplot(data=sales_trends, x='release_date', y='total_sales', hue='platform_category', marker='o')
plt.title('Sales Trends Between Older and Newer Platforms')
plt.xlabel('Release Year')
plt.ylabel('Total Sales')
plt.legend(title='Platform Category')
plt.tight_layout()
plt.show()
In [20]:
# 每款游戏平均销量最高的平台
average_sales_per_game = df.groupby('console')['total_sales'].mean().sort_values(ascending=False).head(20)
plt.figure(figsize=(10, 8))
sns.barplot(x=average_sales_per_game.values, y=average_sales_per_game.index, palette='viridis')
plt.xlabel('Average Sales per Game')
plt.ylabel('Platform')
plt.title('Average Sales per Game by Platform')
plt.tight_layout()
plt.show()
C:\Users\黄清枫\AppData\Local\Temp\ipykernel_8740\3734249363.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=average_sales_per_game.values, y=average_sales_per_game.index, palette='viridis')
In [21]:
# 发行日期及影响
df['release_date'] = pd.to_datetime(df['release_date'])
plt.figure(figsize=(12, 6))
plt.scatter(df['release_date'], df['total_sales'], alpha=0.5,c='r',marker='^')
plt.title('Release Dates vs. Sales Performance')
plt.xlabel('Release Date')
plt.ylabel('Total Sales')
plt.tight_layout()
plt.show()
In [22]:
# 评分最高的顶级开发商,总销量超过1000万
developer_stats = df.groupby('developer').agg({'critic_score': 'mean', 'total_sales': 'sum'}).reset_index()
top_10_critic_score = developer_stats.sort_values(by=['critic_score','total_sales'],kind='quicksort', ascending=[False,False])
filtered_stats = top_10_critic_score[top_10_critic_score['total_sales'] >= 10].head(10)
plt.figure(figsize=(10,6))
sns.pointplot(x='critic_score',y='total_sales',hue='developer',data=filtered_stats)
plt.xticks(rotation=90)
plt.legend(loc='best')
plt.show()
In [23]:
# 具有持续长期销售的游戏
df['release_date'] = pd.to_datetime(df['release_date'])
long_term_period = pd.DateOffset(years=2)
df['long_term_end_date'] = df['release_date'] + long_term_period
long_term_sales = df[df['release_date'] <= df['long_term_end_date']]
long_term_sales_agg = long_term_sales.groupby('title')['total_sales'].sum().reset_index()
long_term_sales_agg = long_term_sales_agg.sort_values(by='total_sales', ascending=False)
plt.figure(figsize=(12, 6))
top_n = 10
plt.barh(long_term_sales_agg['title'].head(top_n), long_term_sales_agg['total_sales'].head(top_n), color='skyblue')
plt.xlabel('Total Sales (Millions)', fontsize=12)
plt.ylabel('Game Title', fontsize=12)
plt.title(f'Top {top_n} Games with Sustained Long-Term Sales', fontsize=14)
plt.gca().invert_yaxis()
for index, value in enumerate(long_term_sales_agg['total_sales'].head(top_n)):
plt.text(value, index, f'{value:.2f}M', ha='left', va='center', fontsize=10)
plt.tight_layout()
plt.show()
In [24]:
# 评论者评论对游戏表现的影响
df['release_date'] = pd.to_datetime(df['release_date'])
df['last_update'] = pd.to_datetime(df['last_update'])
correlation_matrix = df[['critic_score', 'total_sales', 'na_sales', 'jp_sales', 'pal_sales']].corr()
plt.figure(figsize=(10, 8))\nsns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix: Critic Scores vs. Game Performance Metrics')
plt.show()
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='critic_score', y='total_sales', hue='platform_category')
plt.title('Scatter Plot: Critic Scores vs. Total Sales (Colored by Platform Category)')
plt.xlabel('Critic Score')\nplt.ylabel('Total Sales')\nplt.legend(title='Platform Category')
plt.show()
In [ ]: