In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("salaries.csv")
df.head()
df.shape
df.info()
df.describe()
df.describe(include='O')
df.isnull().sum()
df.duplicated().sum()
df.drop_duplicates(inplace=True)
df.duplicated().sum()
# 工资分布直方图
plt.figure(figsize=(8, 6))
sns.histplot(data=df, x='salary_in_usd', kde=True)
plt.title('Distribution of Salaries (USD)')
plt.xlabel('Salary (USD)')
plt.ylabel('Frequency')
plt.show()
# -工资分布似乎是右偏的,长尾倾向于更高的工资。
# -大部分人的工资在101,000 - 186,000美元之间。
# -工资中位数约为14.13万美元,这表明有一半的工资高于/低于此值。
# -有一些异常值的薪水很高,这可能需要进一步调查。
<class 'pandas.core.frame.DataFrame'> RangeIndex: 16494 entries, 0 to 16493 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 work_year 16494 non-null int64 1 experience_level 16494 non-null object 2 employment_type 16494 non-null object 3 job_title 16494 non-null object 4 salary 16494 non-null int64 5 salary_currency 16494 non-null object 6 salary_in_usd 16494 non-null int64 7 employee_residence 16494 non-null object 8 remote_ratio 16494 non-null int64 9 company_location 16494 non-null object 10 company_size 16494 non-null object dtypes: int64(4), object(7) memory usage: 1.4+ MB
In [2]:
# 前20个职位标题的条形图
top_20_titles = df['job_title'].value_counts().head(20)
# 为排名前20的职位创建一个条形图
plt.figure(figsize=(12, 6))
plt.bar(top_20_titles.index, top_20_titles.values)
plt.xlabel('Job Title')
plt.ylabel('Count')
plt.title('Top 20 Job Titles')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# 该柱状图揭示了以下见解:
# 在前20名中,最常见的职位是数据工程师,这表明对该职位的需求很高。
# 其他突出的职位包括数据科学家、数据分析师和机器学习工程师,这表明它们在数据集中的重要性。
# 排名前20的职位出现的频率各不相同,有些职位的出现频率明显高于其他职位。
In [5]:
# 职位名称词云
top_20_titles = df['job_title'].value_counts().head(20)
title_counts = dict(top_20_titles)
from wordcloud import WordCloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(title_counts)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Top 20 - Job Title Word Cloud')
plt.show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[5], line 9 5 title_counts = dict(top_20_titles) 7 from wordcloud import WordCloud ----> 9 wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(title_counts) 11 plt.figure(figsize=(10, 6)) 13 plt.imshow(wordcloud, interpolation='bilinear') File D:\Python310\lib\site-packages\wordcloud\wordcloud.py:453, in WordCloud.generate_from_frequencies(self, frequencies, max_font_size) 451 font_size = self.height 452 else: --> 453 self.generate_from_frequencies(dict(frequencies[:2]), 454 max_font_size=self.height) 455 # find font sizes 456 sizes = [x[1] for x in self.layout_] File D:\Python310\lib\site-packages\wordcloud\wordcloud.py:508, in WordCloud.generate_from_frequencies(self, frequencies, max_font_size) 505 transposed_font = ImageFont.TransposedFont( 506 font, orientation=orientation) 507 # get size of resulting text --> 508 box_size = draw.textbbox((0, 0), word, font=transposed_font, anchor="lt") 509 # find possible places using integral image: 510 result = occupancy.sample_position(box_size[3] + self.margin, 511 box_size[2] + self.margin, 512 random_state) File D:\Python310\lib\site-packages\PIL\ImageDraw.py:671, in ImageDraw.textbbox(self, xy, text, font, anchor, spacing, align, direction, features, language, stroke_width, embedded_color) 669 font = self.getfont() 670 if not isinstance(font, ImageFont.FreeTypeFont): --> 671 raise ValueError("Only supported for TrueType fonts") 672 mode = "RGBA" if embedded_color else self.fontmode 673 bbox = font.getbbox( 674 text, mode, direction, features, language, stroke_width, anchor 675 ) ValueError: Only supported for TrueType fonts
In [6]:
# 前10名职位树图-树状图使用嵌套的矩形来表示每个职位的相对频率。
job_title_counts = df['job_title'].value_counts().head(10)
import squarify
plt.figure(figsize=(12, 6))
squarify.plot(sizes=job_title_counts, label=job_title_counts.index, alpha=0.8)
plt.axis('off')
plt.title('Job Title Treemap')
plt.show()
In [7]:
# 公司规模分析
# 变量
size=df['company_size']
salary=df['salary_in_usd']
#公司规模
print("Value Counts by Company Size:")
company_size_counts = df['company_size'].value_counts()
print(company_size_counts)
# 计算组均值和中位数
print("Mean Salaries by Company Size:")
print(df.groupby('company_size')['salary_in_usd'].mean().to_string(index=True))
print("\nMedian Salaries by Company Size:")
print(df.groupby('company_size')['salary_in_usd'].median().to_string(index=True))
#散点图
plt.figure(figsize=(16, 6))
plt.scatter(salary,size)
plt.xlabel('Salary in USD')
plt.ylabel('Company Size')
plt.title('Salary vs. Company Size')
plt.show()
Value Counts by Company Size: company_size M 9265 L 641 S 187 Name: count, dtype: int64 Mean Salaries by Company Size: company_size L 121670.976599 M 149588.921749 S 86780.197861 Median Salaries by Company Size: company_size L 108000.0 M 140250.0 S 72000.0
In [9]:
#箱型图
plt.figure(figsize=(10, 6))
box_plot_data = [df[df['company_size'] == size]['salary_in_usd'] for size in ['S', 'M', 'L']]
box_plot = plt.boxplot(box_plot_data, patch_artist=True)
colors = ['lightblue', 'lightgreen', 'pink']
for patch, color in zip(box_plot['boxes'], colors):
patch.set_facecolor(color)
plt.xticks([1, 2, 3], ['Small', 'Medium', 'Large'])
plt.xlabel('Company Size')
plt.ylabel('Salary (USD)')
plt.title('Salary Distribution by Company Size')
plt.legend(box_plot['boxes'], ['Small', 'Medium', 'Large'], title='Company Size', loc='upper right')
plt.show()
# 中型企业的平均工资和中位数在这三类企业中是最高的。
# 大公司的平均工资和中位数是第二高的。
# 小公司的平均工资和中位数最低。
# 所有公司规模的工资中位数都低于平均工资,这表明工资分布略有右倾斜(即,有一些高异常值将平均值向上拉)。
In [10]:
# 按经验水平分析工资分配
# 计算每个经验级别的工资中位数
experience_salaries = df.groupby('experience_level')['salary_in_usd'].median().reset_index()
# 按工资中位数降序排序经验水平
experience_salaries_sorted = experience_salaries.sort_values('salary_in_usd', ascending=False)
# 将经验等级与原始数据合并
df_Ranked = pd.merge(df, experience_salaries_sorted, on='experience_level')
# 创建了一个字典,将经验等级映射到他们的等级
experience_rank = {'EN': 0, 'MI': 1, 'SE': 2, 'EX': 3}
df['experience_rank'] = df['experience_level'].map(experience_rank)
print("Experience Levels Ranked by Median Salary:")
print(experience_salaries_sorted)
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='experience_level', y='salary_in_usd')
plt.title('Salary Distribution by Experience Level')
plt.xlabel('Experience Level')
plt.ylabel('Salary (USD)')
plt.xticks([0, 1, 2, 3], ['EN', 'MI', 'SE', 'EX'])
plt.show()
# 按工资中位数对经验等级排序¶
# 首先,我们根据他们的工资中位数对经验水平进行排名。这让我们初步了解了不同经验水平的薪水是如何不同的。
# 输出结果显示,经验等级排序如下:
# EX(行政人员),平均工资为18万美元
# SE(高级),工资中位数为165,000美元
# MI(中级),工资中位数为129,900美元
# EN(入门级),工资中位数为85,750美元
Experience Levels Ranked by Median Salary: experience_level salary_in_usd 1 EX 192000.0 3 SE 154400.0 2 MI 112000.0 0 EN 80455.5
In [11]:
# 按经验水平及受雇类别划分的薪金分布
plt.figure(figsize=(12, 6))
sns.set_color_codes("pastel")
sns.barplot(x='experience_level', y='salary_in_usd', hue='employment_type', data=df,)
plt.title('Salary Distribution by Experience Level and Employment Type')
plt.xlabel('Experience Level')
plt.ylabel('Salary (USD)')
plt.legend(title='Employment Type')
plt.show()
In [12]:
# 历年按经验水平划分的平均工资
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='work_year', y='salary_in_usd', hue='experience_level', estimator='mean', ci=None)
plt.title('Average salaries by level of experience over the years')
plt.xlabel('Year')
plt.ylabel('Salary (USD)')
plt.legend(title='Experience level', labels=['EN', 'MI', 'SE', 'EX'])
plt.show()
In [13]:
# 历年的薪资趋势
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='work_year', y='salary_in_usd', estimator='mean', ci=None)
plt.title('Salary trends over the years')
plt.xlabel('Year')
plt.ylabel('Salary (USD)')
plt.show()
In [14]:
# 按公司规模划分的历年平均工资
plt.figure(figsize=(10, 6))
df.groupby(['work_year', 'company_size'])['salary_in_usd'].mean().unstack().plot(kind='line', marker='o')
plt.title('Average Salaries by Company Size Over the Years')
plt.xlabel('Year')
plt.ylabel('Average salary (USD)')
plt.legend(title='Company size')
plt.show()
<Figure size 1000x600 with 0 Axes>
In [15]:
# 相关性分析
numeric_columns = df.select_dtypes(include=['number'])
correlation_matrix = numeric_columns.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()
df.head()
new_df = df.drop(['salary','salary_currency'],axis=1)
new_df.head()
Out[15]:
| work_year | experience_level | employment_type | job_title | salary_in_usd | employee_residence | remote_ratio | company_location | company_size | experience_rank | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2024 | MI | FT | Data Scientist | 120000 | AU | 0 | AU | S | 1 |
| 1 | 2024 | MI | FT | Data Scientist | 70000 | AU | 0 | AU | S | 1 |
| 2 | 2024 | MI | CT | Data Scientist | 130000 | US | 0 | US | M | 1 |
| 3 | 2024 | MI | CT | Data Scientist | 110000 | US | 0 | US | M | 1 |
| 4 | 2024 | MI | FT | Data Science Manager | 240000 | US | 0 | US | M | 1 |
In [18]:
# 相关性分析
numeric_columns = df.select_dtypes(include=['number'])
correlation_matrix = numeric_columns.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()
df.head()
new_df = df.drop(['salary', 'salary_currency'], axis=1)
new_df.head()
from sklearn.preprocessing import LabelEncoder
for col in new_df.describe(include='O'):
new_df[col] = LabelEncoder().fit_transform(new_df[col])
new_df.head()
sns.boxplot(data=new_df, y='salary_in_usd')
new_df = new_df[new_df['salary_in_usd'] < 400000]
from sklearn.model_selection import train_test_split
X = new_df.drop('salary_in_usd', axis=1)
y = new_df['salary_in_usd']
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
import seaborn as sns
import matplotlib.pylab as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # 解决中文显示
plt.rcParams['axes.unicode_minus'] = False # 解决符号无法显示
sns.set(font='SimHei')
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
# 定义一个训练模型并对模型各个指标进行评估的函数
def train_model(ml_model):
print("Model is: ", ml_model)
model = ml_model.fit(X_train, y_train)
print("Training score: ", model.score(X_train, y_train))
predictions = model.predict(X_test)
r2score = r2_score(y_test, predictions)
print("r2 score is: ", r2score)
print('MAE:', mean_absolute_error(y_test, predictions))
print('MSE:', mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))
# 真实值和预测值的差值
sns.distplot(y_test - predictions)
# 构建多元线性回归
from sklearn.linear_model import LinearRegression
lg = LinearRegression()
train_model(lg)
# 构建 KNN 回归
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
train_model(knn)
# 构建决策树回归
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
train_model(tree)
# 构建随机森林回归
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()
train_model(forest)
# GBDT 回归
from sklearn.ensemble import GradientBoostingRegressor
gbdt = GradientBoostingRegressor()
train_model(gbdt)
# 打印特征重要性评分
feat_labels = X_train.columns[0:]
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
index_list = []
value_list = []
for f, j in zip(range(X_train.shape[1]), indices):
index_list.append(feat_labels[j])
value_list.append(importances[j])
print(f + 1, feat_labels[j], importances[j])
plt.figure(figsize=(10, 6))
plt.barh(index_list[::-1], value_list[::-1])
plt.yticks(fontsize=12)
plt.title('各特征重要程度排序', fontsize=14)
plt.show()
# 使用随机森林模型预测并可视化
plt.figure(figsize=(10, 6))
y_pred = forest.predict(X_test)
plt.plot(range(len(y_test))[:200], y_pred[:200], 'b', label='预测值')
plt.plot(range(len(y_test))[:200], y_test[:200], 'r', label='真实值')
plt.legend(loc='upper right', fontsize=15)
plt.xlabel('the number of job', fontdict={'weight': 'normal', 'size': 15})
plt.ylabel('value of salary', fontdict={'weight': 'normal', 'size': 15})
plt.show()
Model is: LinearRegression() Training score: 0.25108271730067955 r2 score is: 0.26134514535068776 MAE: 44577.27978081522 MSE: 3122210075.1681466 RMSE: 55876.74001915418 Model is: KNeighborsRegressor() Training score: 0.3606268138549309 r2 score is: 0.24700684160122377 MAE: 44390.68830263813 MSE: 3182816454.6505523 RMSE: 56416.455530727486 Model is: DecisionTreeRegressor() Training score: 0.49404076237549877 r2 score is: 0.2807020525561107 MAE: 43266.20909923051 MSE: 3040390629.5631223 RMSE: 55139.7373004544 Model is: RandomForestRegressor() Training score: 0.48411775767688514 r2 score is: 0.3385299225272602 MAE: 41906.41949422298 MSE: 2795958799.0363255 RMSE: 52876.82667328218 Model is: GradientBoostingRegressor() Training score: 0.37622316939761435 r2 score is: 0.35797442385156075 MAE: 41206.14697020939 MSE: 2713769103.0514874 RMSE: 52093.84899440132 1 job_title 0.3298787815857886 2 experience_rank 0.28634672346700385 3 employee_residence 0.1835692231085222 4 work_year 0.06424054407384938 5 company_location 0.04822621776896799 6 remote_ratio 0.03983978142574876 7 company_size 0.022013136070875086 8 experience_level 0.021152358756413435 9 employment_type 0.004733233742830804
In [ ]: