import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv("salaries.csv")
df.head()
df.shape
df.info()
df.describe()
df.describe(include='O')
df.isnull().sum()
df.duplicated().sum()
df.drop_duplicates(inplace=True)
df.duplicated().sum()
# 工资分布直方图
plt.figure(figsize=(8, 6))
sns.histplot(data=df, x='salary_in_usd', kde=True)
plt.title('Distribution of Salaries (USD)')
plt.xlabel('Salary (USD)')
plt.ylabel('Frequency')
plt.show()
# -工资分布似乎是右偏的，长尾倾向于更高的工资。
# -大部分人的工资在101,000 - 186,000美元之间。
# -工资中位数约为14.13万美元，这表明有一半的工资高于/低于此值。
# -有一些异常值的薪水很高，这可能需要进一步调查。

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16494 entries, 0 to 16493
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           16494 non-null  int64 
 1   experience_level    16494 non-null  object
 2   employment_type     16494 non-null  object
 3   job_title           16494 non-null  object
 4   salary              16494 non-null  int64 
 5   salary_currency     16494 non-null  object
 6   salary_in_usd       16494 non-null  int64 
 7   employee_residence  16494 non-null  object
 8   remote_ratio        16494 non-null  int64 
 9   company_location    16494 non-null  object
 10  company_size        16494 non-null  object
dtypes: int64(4), object(7)
memory usage: 1.4+ MB

# 前20个职位标题的条形图
top_20_titles = df['job_title'].value_counts().head(20)
# 为排名前20的职位创建一个条形图
plt.figure(figsize=(12, 6))
plt.bar(top_20_titles.index, top_20_titles.values)
plt.xlabel('Job Title')
plt.ylabel('Count')
plt.title('Top 20 Job Titles')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# 该柱状图揭示了以下见解:
# 在前20名中，最常见的职位是数据工程师，这表明对该职位的需求很高。
# 其他突出的职位包括数据科学家、数据分析师和机器学习工程师，这表明它们在数据集中的重要性。
# 排名前20的职位出现的频率各不相同，有些职位的出现频率明显高于其他职位。

# 职位名称词云
top_20_titles = df['job_title'].value_counts().head(20)
title_counts = dict(top_20_titles)
from wordcloud import WordCloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(title_counts)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Top 20 - Job Title Word Cloud')
plt.show()

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[5], line 9
      5 title_counts = dict(top_20_titles)
      7 from wordcloud import WordCloud
----> 9 wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(title_counts)
     11 plt.figure(figsize=(10, 6))
     13 plt.imshow(wordcloud, interpolation='bilinear')

File D:\Python310\lib\site-packages\wordcloud\wordcloud.py:453, in WordCloud.generate_from_frequencies(self, frequencies, max_font_size)
    451     font_size = self.height
    452 else:
--> 453     self.generate_from_frequencies(dict(frequencies[:2]),
    454                                    max_font_size=self.height)
    455     # find font sizes
    456     sizes = [x[1] for x in self.layout_]

File D:\Python310\lib\site-packages\wordcloud\wordcloud.py:508, in WordCloud.generate_from_frequencies(self, frequencies, max_font_size)
    505 transposed_font = ImageFont.TransposedFont(
    506     font, orientation=orientation)
    507 # get size of resulting text
--> 508 box_size = draw.textbbox((0, 0), word, font=transposed_font, anchor="lt")
    509 # find possible places using integral image:
    510 result = occupancy.sample_position(box_size[3] + self.margin,
    511                                    box_size[2] + self.margin,
    512                                    random_state)

File D:\Python310\lib\site-packages\PIL\ImageDraw.py:671, in ImageDraw.textbbox(self, xy, text, font, anchor, spacing, align, direction, features, language, stroke_width, embedded_color)
    669     font = self.getfont()
    670 if not isinstance(font, ImageFont.FreeTypeFont):
--> 671     raise ValueError("Only supported for TrueType fonts")
    672 mode = "RGBA" if embedded_color else self.fontmode
    673 bbox = font.getbbox(
    674     text, mode, direction, features, language, stroke_width, anchor
    675 )

ValueError: Only supported for TrueType fonts

# 前10名职位树图-树状图使用嵌套的矩形来表示每个职位的相对频率。
job_title_counts = df['job_title'].value_counts().head(10)
import squarify
plt.figure(figsize=(12, 6))
squarify.plot(sizes=job_title_counts, label=job_title_counts.index, alpha=0.8)
plt.axis('off')
plt.title('Job Title Treemap')
plt.show()

# 公司规模分析
# 变量
size=df['company_size']
salary=df['salary_in_usd']
#公司规模
print("Value Counts by Company Size:")
company_size_counts = df['company_size'].value_counts()
print(company_size_counts)
# 计算组均值和中位数
print("Mean Salaries by Company Size:")
print(df.groupby('company_size')['salary_in_usd'].mean().to_string(index=True))
print("\nMedian Salaries by Company Size:")
print(df.groupby('company_size')['salary_in_usd'].median().to_string(index=True))
#散点图
plt.figure(figsize=(16, 6))
plt.scatter(salary,size)
plt.xlabel('Salary in USD')
plt.ylabel('Company Size')
plt.title('Salary vs. Company Size')
plt.show()

Value Counts by Company Size:
company_size
M    9265
L     641
S     187
Name: count, dtype: int64
Mean Salaries by Company Size:
company_size
L    121670.976599
M    149588.921749
S     86780.197861

Median Salaries by Company Size:
company_size
L    108000.0
M    140250.0
S     72000.0

#箱型图
plt.figure(figsize=(10, 6))
box_plot_data = [df[df['company_size'] == size]['salary_in_usd'] for size in ['S', 'M', 'L']]
box_plot = plt.boxplot(box_plot_data, patch_artist=True)
colors = ['lightblue', 'lightgreen', 'pink']
for patch, color in zip(box_plot['boxes'], colors):
    patch.set_facecolor(color)
plt.xticks([1, 2, 3], ['Small', 'Medium', 'Large'])
plt.xlabel('Company Size')
plt.ylabel('Salary (USD)')
plt.title('Salary Distribution by Company Size')
plt.legend(box_plot['boxes'], ['Small', 'Medium', 'Large'], title='Company Size', loc='upper right')
plt.show()
# 中型企业的平均工资和中位数在这三类企业中是最高的。
# 大公司的平均工资和中位数是第二高的。
# 小公司的平均工资和中位数最低。
# 所有公司规模的工资中位数都低于平均工资，这表明工资分布略有右倾斜(即，有一些高异常值将平均值向上拉)。

# 按经验水平分析工资分配
# 计算每个经验级别的工资中位数
experience_salaries = df.groupby('experience_level')['salary_in_usd'].median().reset_index()
# 按工资中位数降序排序经验水平
experience_salaries_sorted = experience_salaries.sort_values('salary_in_usd', ascending=False)
# 将经验等级与原始数据合并
df_Ranked = pd.merge(df, experience_salaries_sorted, on='experience_level')
# 创建了一个字典，将经验等级映射到他们的等级
experience_rank = {'EN': 0, 'MI': 1, 'SE': 2, 'EX': 3}
df['experience_rank'] = df['experience_level'].map(experience_rank)
print("Experience Levels Ranked by Median Salary:")
print(experience_salaries_sorted)
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='experience_level', y='salary_in_usd')
plt.title('Salary Distribution by Experience Level')
plt.xlabel('Experience Level')
plt.ylabel('Salary (USD)')
plt.xticks([0, 1, 2, 3], ['EN', 'MI', 'SE', 'EX'])
plt.show()
# 按工资中位数对经验等级排序¶
# 首先，我们根据他们的工资中位数对经验水平进行排名。这让我们初步了解了不同经验水平的薪水是如何不同的。
# 输出结果显示，经验等级排序如下:
# EX(行政人员)，平均工资为18万美元
# SE(高级)，工资中位数为165,000美元
# MI(中级)，工资中位数为129,900美元
# EN(入门级)，工资中位数为85,750美元

Experience Levels Ranked by Median Salary:
  experience_level  salary_in_usd
1               EX       192000.0
3               SE       154400.0
2               MI       112000.0
0               EN        80455.5

# 按经验水平及受雇类别划分的薪金分布
plt.figure(figsize=(12, 6))
sns.set_color_codes("pastel")
sns.barplot(x='experience_level', y='salary_in_usd', hue='employment_type', data=df,)
plt.title('Salary Distribution by Experience Level and Employment Type')
plt.xlabel('Experience Level')
plt.ylabel('Salary (USD)')
plt.legend(title='Employment Type')
plt.show()

# 历年按经验水平划分的平均工资
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='work_year', y='salary_in_usd', hue='experience_level', estimator='mean', ci=None)
plt.title('Average salaries by level of experience over the years')
plt.xlabel('Year')
plt.ylabel('Salary (USD)')
plt.legend(title='Experience level', labels=['EN', 'MI', 'SE', 'EX'])
plt.show()

# 历年的薪资趋势
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='work_year', y='salary_in_usd', estimator='mean', ci=None)
plt.title('Salary trends over the years')
plt.xlabel('Year')
plt.ylabel('Salary (USD)')
plt.show()

# 按公司规模划分的历年平均工资
plt.figure(figsize=(10, 6))
df.groupby(['work_year', 'company_size'])['salary_in_usd'].mean().unstack().plot(kind='line', marker='o')
plt.title('Average Salaries by Company Size Over the Years')
plt.xlabel('Year')
plt.ylabel('Average salary (USD)')
plt.legend(title='Company size')
plt.show()

<Figure size 1000x600 with 0 Axes>

# 相关性分析
numeric_columns = df.select_dtypes(include=['number'])
correlation_matrix = numeric_columns.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()
df.head()
new_df = df.drop(['salary','salary_currency'],axis=1)
new_df.head()

# 相关性分析
numeric_columns = df.select_dtypes(include=['number'])
correlation_matrix = numeric_columns.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()
df.head()
new_df = df.drop(['salary', 'salary_currency'], axis=1)
new_df.head()
from sklearn.preprocessing import LabelEncoder
for col in new_df.describe(include='O'):
    new_df[col] = LabelEncoder().fit_transform(new_df[col])
new_df.head()
sns.boxplot(data=new_df, y='salary_in_usd')
new_df = new_df[new_df['salary_in_usd'] < 400000]
from sklearn.model_selection import train_test_split
X = new_df.drop('salary_in_usd', axis=1)
y = new_df['salary_in_usd']
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
import seaborn as sns
import matplotlib.pylab as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # 解决中文显示
plt.rcParams['axes.unicode_minus'] = False    # 解决符号无法显示
sns.set(font='SimHei')
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
# 定义一个训练模型并对模型各个指标进行评估的函数
def train_model(ml_model):
    print("Model is: ", ml_model)
    model = ml_model.fit(X_train, y_train)
    print("Training score: ", model.score(X_train, y_train))
    predictions = model.predict(X_test)
    r2score = r2_score(y_test, predictions)
    print("r2 score is: ", r2score)
    print('MAE:', mean_absolute_error(y_test, predictions))
    print('MSE:', mean_squared_error(y_test, predictions))
    print('RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))
    # 真实值和预测值的差值
    sns.distplot(y_test - predictions)
# 构建多元线性回归
from sklearn.linear_model import LinearRegression
lg = LinearRegression()
train_model(lg)
# 构建 KNN 回归
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
train_model(knn)
# 构建决策树回归
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()
train_model(tree)
# 构建随机森林回归
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()
train_model(forest)
# GBDT 回归
from sklearn.ensemble import GradientBoostingRegressor
gbdt = GradientBoostingRegressor()
train_model(gbdt)
# 打印特征重要性评分
feat_labels = X_train.columns[0:]
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
index_list = []
value_list = []
for f, j in zip(range(X_train.shape[1]), indices):
    index_list.append(feat_labels[j])
    value_list.append(importances[j])
    print(f + 1, feat_labels[j], importances[j])
plt.figure(figsize=(10, 6))
plt.barh(index_list[::-1], value_list[::-1])
plt.yticks(fontsize=12)
plt.title('各特征重要程度排序', fontsize=14)
plt.show()
# 使用随机森林模型预测并可视化
plt.figure(figsize=(10, 6))
y_pred = forest.predict(X_test)
plt.plot(range(len(y_test))[:200], y_pred[:200], 'b', label='预测值')
plt.plot(range(len(y_test))[:200], y_test[:200], 'r', label='真实值')
plt.legend(loc='upper right', fontsize=15)
plt.xlabel('the number of job', fontdict={'weight': 'normal', 'size': 15})
plt.ylabel('value of salary', fontdict={'weight': 'normal', 'size': 15})
plt.show()

Model is:  LinearRegression()
Training score:  0.25108271730067955
r2 score is:  0.26134514535068776
MAE: 44577.27978081522
MSE: 3122210075.1681466
RMSE: 55876.74001915418
Model is:  KNeighborsRegressor()
Training score:  0.3606268138549309
r2 score is:  0.24700684160122377
MAE: 44390.68830263813
MSE: 3182816454.6505523
RMSE: 56416.455530727486
Model is:  DecisionTreeRegressor()
Training score:  0.49404076237549877
r2 score is:  0.2807020525561107
MAE: 43266.20909923051
MSE: 3040390629.5631223
RMSE: 55139.7373004544
Model is:  RandomForestRegressor()
Training score:  0.48411775767688514
r2 score is:  0.3385299225272602
MAE: 41906.41949422298
MSE: 2795958799.0363255
RMSE: 52876.82667328218
Model is:  GradientBoostingRegressor()
Training score:  0.37622316939761435
r2 score is:  0.35797442385156075
MAE: 41206.14697020939
MSE: 2713769103.0514874
RMSE: 52093.84899440132
1 job_title 0.3298787815857886
2 experience_rank 0.28634672346700385
3 employee_residence 0.1835692231085222
4 work_year 0.06424054407384938
5 company_location 0.04822621776896799
6 remote_ratio 0.03983978142574876
7 company_size 0.022013136070875086
8 experience_level 0.021152358756413435
9 employment_type 0.004733233742830804

	work_year	experience_level	employment_type	job_title	salary_in_usd	employee_residence	company_location	company_size	experience_rank
0	2024	MI	FT	Data Scientist	120000	AU	AU	S	1
1	2024	MI	FT	Data Scientist	70000	AU	AU	S	1
2	2024	MI	CT	Data Scientist	130000	US	US	M	1
3	2024	MI	CT	Data Scientist	110000	US	US	M	1
4	2024	MI	FT	Data Science Manager	240000	US	US	M	1

📘 机器学习工程师薪资数据可视化/工资.ipynb