In [1]:
import matplotlib.pylab as plt
import numpy as np
import seaborn as sns
import pandas as pd
plt.rcParams['font.sans-serif'] = ['SimHei'] #解决中文显示
plt.rcParams['axes.unicode_minus'] = False #解决符号无法显示
sns.set(font='SimHei')
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('杭州租房数据.csv')
df.head()
df.shape
df.info()
df.describe().T
df.isnull().sum() # 统计缺失值情况
df.duplicated().sum() # 统计重复数据情况
df.dropna(inplace=True) # 删除缺失值
df.drop_duplicates(inplace=True) # 删除重复值
df.shape
df['房屋租金'] = df['房屋租金'].apply(lambda x:int(x.split('元')[0]))
df['房屋租金']
<class 'pandas.core.frame.DataFrame'> RangeIndex: 257 entries, 0 to 256 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 城市 257 non-null object 1 房屋租金 257 non-null object 2 交付方式 257 non-null object 3 出租方式 257 non-null object 4 房屋户型 257 non-null object 5 房屋面积 257 non-null object 6 房屋朝向 257 non-null object 7 楼层 257 non-null object 8 房屋装修 257 non-null object 9 小区 257 non-null object 10 距地铁距离 257 non-null object 11 地址 257 non-null object 12 配套设施 256 non-null object 13 房源亮点 257 non-null object dtypes: object(14) memory usage: 28.2+ KB
Out[1]:
0 11000
1 8000
2 13500
3 8505
4 4000
...
251 3500
252 3800
253 4500
254 6100
256 6999
Name: 房屋租金, Length: 221, dtype: int64
In [2]:
df['房屋面积'] = df['房屋面积'].apply(lambda x:int(x[:-2]))
df['房屋面积']
Out[2]:
0 256
1 110
2 177
3 168
4 61
...
251 89
252 88
253 88
254 87
256 138
Name: 房屋面积, Length: 221, dtype: int64
In [3]:
sns.boxplot(data=df,x='房屋租金')
plt.show()
In [4]:
sns.histplot(data=df,x='房屋租金',kde=True)
plt.show()
In [5]:
sns.boxplot(data=df,y='房屋面积')
plt.show()
In [6]:
sns.histplot(data=df,x='房屋面积',kde=True)
plt.show()
plt.scatter(x=df['房屋面积'],y=df['房屋租金'])
plt.show()
df['交付方式'].value_counts()
sns.countplot(data=df,x='交付方式')
plt.show()
df['出租方式'].value_counts().plot(kind='pie',autopct='%.2f%%')
plt.show()
sns.boxplot(data=df,y='房屋租金',x='交付方式')
plt.show()
sns.boxplot(data=df,y='房屋租金',x='出租方式')
plt.show()
df['房屋朝向'].value_counts().plot(kind='pie',autopct='%.2f%%')
plt.show()
sns.barplot(data=df,x='房屋朝向',y='房屋租金')
plt.show()
In [7]:
def subway_distance(x):
try:
result = x.split('约')[1].split('米')[0]
except:
result = 0
return int(result)
In [8]:
df['距地铁距离'] = df['距地铁距离'].apply(subway_distance)
df['距地铁距离']
plt.scatter(x=df['距地铁距离'],y=df['房屋租金'])
plt.show()
df['楼层'].value_counts().plot(kind='pie',autopct='%.2f%%')
plt.show()
sns.barplot(data=df,x='楼层',y='房屋租金')
plt.show()
df['房屋装修'].value_counts()
df['房屋装修'].value_counts().plot(kind='pie',autopct='%.2f%%')
plt.show()
sns.barplot(data=df,x='房屋装修',y='房屋租金')
plt.show()
In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
# 只选择数值型列
numeric_df = df.select_dtypes(include=['number'])
# 计算相关性并绘制热图
sns.heatmap(numeric_df.corr(), vmax=1, annot=True, linewidths=0.5, cbar=False, cmap='YlGnBu', annot_kws={'fontsize': 18})
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()
In [17]:
import jieba
import collections
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
def draw_WorldCloud(df, pic_name, color='black'):
# 将 DataFrame 中的所有文本数据合并为一个字符串
data = ''.join([str(item) for item in df])
# 文本预处理:去除一些无用的字符,只提取出中文
new_data = re.findall('[\u4e00-\u9fa5]+', data, re.S)
new_data = "".join(new_data)
# 文本分词
seg_list_exact = jieba.cut(new_data, cut_all=True)
result_list = []
# 读取停用词库
with open('停用词库.txt', encoding='utf-8') as f:
con = f.readlines()
stop_words = set()
for i in con:
i = i.replace("\n", "")
stop_words.add(i)
# 过滤停用词
for word in seg_list_exact:
if word not in stop_words and len(word) > 1:
result_list.append(word)
# 词频统计:获取前100个最高频的词
word_counts = collections.Counter(result_list)
word_counts_top = word_counts.most_common(100)
print(word_counts_top)
# 使用 WordCloud 生成词云图
wordcloud = WordCloud(
font_path=r'C:\Windows\Fonts\msyh.ttc',
width=800,
height=400,
background_color=color,
max_words=2000,
max_font_size=150
).generate(' '.join(result_list[:500]))
# 显示词云图
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off') # 不显示坐标轴
plt.savefig(f'{pic_name}.png') # 保存图片
plt.show() # 展示图片
# 调用函数进行词云图可视化
draw_WorldCloud(df['房源亮点'], '房源亮点词云图') # 词云图可视化
[('房子', 80), ('小区', 72), ('押金', 56), ('房源', 55), ('选择', 55), ('价格', 54), ('入住', 52), ('环境', 48), ('装修', 41), ('精装', 40), ('开发', 39), ('房东', 38), ('开发商', 38), ('出租', 37), ('租价', 34), ('诚心', 34), ('保养', 34), ('可以', 33), ('精装修', 33), ('修配', 32), ('配置', 32), ('高小', 32), ('不二', 32), ('未来', 31), ('房屋', 29), ('包含', 28), ('地铁', 27), ('实地', 25), ('位置', 24), ('齐全', 24), ('核心', 24), ('配套', 23), ('科技', 22), ('经验', 20), ('西溪', 20), ('生活', 19), ('介绍', 19), ('多年', 19), ('开始', 19), ('经纪', 19), ('一个', 19), ('服务', 19), ('便利', 18), ('方便', 18), ('拍摄', 18), ('近地', 18), ('核心区', 18), ('只是', 18), ('诚实', 18), ('经纪人', 18), ('省去', 18), ('很多', 18), ('不必', 18), ('不必要', 18), ('必要', 18), ('麻烦', 18), ('行业', 18), ('从业', 18), ('业经', 18), ('十多', 18), ('十多年', 18), ('不会', 18), ('失望', 18), ('户型', 16), ('欢迎', 16), ('拎包', 16), ('十分', 15), ('本人', 15), ('南北', 15), ('套房', 14), ('推荐', 13), ('真实', 12), ('交通', 12), ('客户', 12), ('光临', 11), ('提供', 11), ('承诺', 11), ('店铺', 10), ('欢迎您', 10), ('您好', 9), ('高兴', 9), ('绿化', 9), ('滨江', 9), ('南房', 9), ('西南', 9), ('时间', 8), ('随时', 8), ('房产', 8), ('地铁口', 8), ('铁口', 8), ('就是', 8), ('地勘', 7), ('勘察', 7), ('理由', 7), ('华夏', 7), ('四季', 7), ('国际', 7), ('中心', 7), ('心室', 7), ('希望', 6)]
In [18]:
# 特征筛选
new_df = df[['房屋租金', '交付方式', '出租方式', '房屋户型', '房屋面积', '房屋朝向', '楼层', '房屋装修','距地铁距离']]
new_df
Out[18]:
| 房屋租金 | 交付方式 | 出租方式 | 房屋户型 | 房屋面积 | 房屋朝向 | 楼层 | 房屋装修 | 距地铁距离 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 11000 | 押一付三 | 整租 | 4室2厅4卫 | 256 | 南北 | 低层 | 精装修 | 665 |
| 1 | 8000 | 押一付三 | 整租 | 2室2厅1卫 | 110 | 南 | 中层 | 精装修 | 235 |
| 2 | 13500 | 押一付三 | 整租 | 4室2厅2卫 | 177 | 南 | 中层 | 精装修 | 227 |
| 3 | 8505 | 押一付三 | 整租 | 3室2厅2卫 | 168 | 南 | 低层 | 精装修 | 671 |
| 4 | 4000 | 押一付三 | 整租 | 3室1厅1卫 | 61 | 南 | 暂无数据 | 简装修 | 246 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 251 | 3500 | 押一付三 | 整租 | 3室2厅2卫 | 89 | 南北 | 中层 | 精装修 | 0 |
| 252 | 3800 | 押一付三 | 整租 | 3室2厅1卫 | 88 | 南北 | 高层 | 精装修 | 1012 |
| 253 | 4500 | 面议 | 整租 | 3室2厅1卫 | 88 | 南北 | 中层 | 精装修 | 1092 |
| 254 | 6100 | 押一付三 | 整租 | 3室1厅1卫 | 87 | 南北 | 中层 | 精装修 | 469 |
| 256 | 6999 | 押一付三 | 整租 | 4室1厅2卫 | 138 | 南北 | 高层 | 精装修 | 0 |
221 rows × 9 columns
In [20]:
# 编码处理
from sklearn.preprocessing import LabelEncoder
for col in new_df.describe(include='O').columns.to_list():
new_df[col] = LabelEncoder().fit_transform(new_df[col])
new_df
Out[20]:
| 房屋租金 | 交付方式 | 出租方式 | 房屋户型 | 房屋面积 | 房屋朝向 | 楼层 | 房屋装修 | 距地铁距离 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 11000 | 2 | 0 | 16 | 256 | 3 | 1 | 2 | 665 |
| 1 | 8000 | 2 | 0 | 3 | 110 | 2 | 0 | 2 | 235 |
| 2 | 13500 | 2 | 0 | 14 | 177 | 2 | 0 | 2 | 227 |
| 3 | 8505 | 2 | 0 | 8 | 168 | 2 | 1 | 2 | 671 |
| 4 | 4000 | 2 | 0 | 5 | 61 | 2 | 2 | 1 | 246 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 251 | 3500 | 2 | 0 | 8 | 89 | 3 | 0 | 2 | 0 |
| 252 | 3800 | 2 | 0 | 7 | 88 | 3 | 4 | 2 | 1012 |
| 253 | 4500 | 4 | 0 | 7 | 88 | 3 | 0 | 2 | 1092 |
| 254 | 6100 | 2 | 0 | 5 | 87 | 3 | 0 | 2 | 469 |
| 256 | 6999 | 2 | 0 | 12 | 138 | 3 | 4 | 2 | 0 |
221 rows × 9 columns
In [21]:
from sklearn.model_selection import train_test_split
# 准备数据
X = new_df.drop('房屋租金',axis=1)
y = new_df['房屋租金']
# 划分数据集
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
print('训练集大小:',X_train.shape[0])
print('测试集大小:',X_test.shape[0])
训练集大小: 176 测试集大小: 45
In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
# 准备数据
X = new_df.drop('房屋租金', axis=1)
y = new_df['房屋租金']
# 划分数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('训练集大小:', X_train.shape[0])
print('测试集大小:', X_test.shape[0])
# 定义一个训练模型并输出模型的评估指标
def train_model(ml_model):
print("Model is: ", ml_model)
model = ml_model.fit(X_train, y_train)
print("Training score: ", model.score(X_train, y_train))
predictions = model.predict(X_test)
r2score = r2_score(y_test, predictions)
print("r2 score is: ", r2score)
print('MAE:', mean_absolute_error(y_test, predictions))
print('MSE:', mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictions)))
# 真实值和预测值的差值
sns.distplot(y_test - predictions)
plt.show() # 确保展示图像
# 构建和训练多种模型
models = {
"线性回归": LinearRegression(),
"KNN回归": KNeighborsRegressor(),
"决策树回归": DecisionTreeRegressor(),
"随机森林回归": RandomForestRegressor(),
"GBDT回归": GradientBoostingRegressor(),
"XGBoost回归": XGBRegressor()
}
for model_name, model in models.items():
print(f"正在训练: {model_name}")
train_model(model)
# 特征重要性评分
feat_labels = X_train.columns
importances = models["XGBoost回归"].feature_importances_
indices = np.argsort(importances)[::-1]
index_list = []
value_list = []
for f, j in zip(range(X_train.shape[1]), indices):
index_list.append(feat_labels[j])
value_list.append(importances[j])
print(f"{f + 1}. {feat_labels[j]}: {importances[j]:.4f}")
# 绘制特征重要性
plt.figure(figsize=(10, 6))
plt.barh(index_list[::-1], value_list[::-1])
plt.yticks(fontsize=12)
plt.title('各特征重要程度排序', fontsize=14)
plt.show()
# 模型预测
y_pred = models["XGBoost回归"].predict(X_test)
result_df = pd.DataFrame()
result_df['真实值'] = y_test
result_df['预测值'] = y_pred
print(result_df.head(10))
# 模型预测可视化
plt.figure(figsize=(10, 6))
plt.plot(range(len(y_test))[:200], y_pred[:200], 'b', label='预测值')
plt.plot(range(len(y_test))[:200], y_test[:200], 'r', label='真实值')
plt.legend(loc='upper right', fontsize=15)
plt.xlabel('房屋数量', fontdict={'weight': 'normal', 'size': 15})
plt.ylabel('房价', fontdict={'weight': 'normal', 'size': 15})
plt.show()
训练集大小: 176 测试集大小: 45 正在训练: 线性回归 Model is: LinearRegression() Training score: 0.6283236932936127 r2 score is: -0.10016844383108725 MAE: 1730.2807846318938 MSE: 5091666.976076585 RMSE: 2256.4722413707163
正在训练: KNN回归 Model is: KNeighborsRegressor() Training score: 0.6538480769614943 r2 score is: 0.5230410571189587 MAE: 1078.7822222222221 MSE: 2207403.886222222 RMSE: 1485.7334505967824
正在训练: 决策树回归 Model is: DecisionTreeRegressor() Training score: 0.9995966865142927 r2 score is: 0.5721344228136658 MAE: 1117.0444444444445 MSE: 1980195.888888889 RMSE: 1407.1943323112444
正在训练: 随机森林回归 Model is: RandomForestRegressor() Training score: 0.9382871231630941 r2 score is: 0.5839769200479268 MAE: 969.8347407407408 MSE: 1925387.8706985186 RMSE: 1387.583464408004
正在训练: GBDT回归 Model is: GradientBoostingRegressor() Training score: 0.989019740895184 r2 score is: 0.46408665347428124 MAE: 995.3278571724436 MSE: 2480249.5507339197 RMSE: 1574.8808052465176
正在训练: XGBoost回归
Model is: XGBRegressor(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
Training score: 0.9995867426654103
r2 score is: 0.3472446333793353
MAE: 1194.8612874348958
MSE: 3021003.7038559965
RMSE: 1738.1034790414512
1. 房屋面积: 0.5794 2. 房屋户型: 0.1798 3. 交付方式: 0.0791 4. 房屋朝向: 0.0741 5. 楼层: 0.0495 6. 距地铁距离: 0.0361 7. 房屋装修: 0.0020 8. 出租方式: 0.0000
真实值 预测值 158 4460 3485.394531 174 3650 3865.016846 117 3300 3048.186035 210 3800 3299.256836 15 4150 1901.632690 141 5200 4504.125977 202 6000 7311.804688 244 6800 4243.289062 98 3580 2445.108887 168 6800 4417.584961
In [ ]: