import matplotlib.pylab as plt
import numpy as np
import seaborn as sns
import pandas as pd
# Plot configuration: SimHei so Chinese labels render, and a plain minus sign
# so negative tick labels are displayed.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
sns.set(font='SimHei')
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Load the raw listings.
df = pd.read_csv('杭州租房数据.csv')

# Quick inspection of the raw table.
df.head()
df.shape
df.info()
df.describe().T
df.isnull().sum()      # missing values per column
df.duplicated().sum()  # number of fully duplicated rows

# Cleaning: drop rows with any missing value, then drop duplicated rows.
df = df.dropna().drop_duplicates()
df.shape

# Rent: keep the text in front of the first '元' and convert it to int.
df['房屋租金'] = df['房屋租金'].map(lambda rent: int(rent.partition('元')[0]))
df['房屋租金']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 257 entries, 0 to 256
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   城市      257 non-null    object
 1   房屋租金    257 non-null    object
 2   交付方式    257 non-null    object
 3   出租方式    257 non-null    object
 4   房屋户型    257 non-null    object
 5   房屋面积    257 non-null    object
 6   房屋朝向    257 non-null    object
 7   楼层      257 non-null    object
 8   房屋装修    257 non-null    object
 9   小区      257 non-null    object
 10  距地铁距离   257 non-null    object
 11  地址      257 non-null    object
 12  配套设施    256 non-null    object
 13  房源亮点    257 non-null    object
dtypes: object(14)
memory usage: 28.2+ KB

0      11000
1       8000
2      13500
3       8505
4       4000
       ...  
251     3500
252     3800
253     4500
254     6100
256     6999
Name: 房屋租金, Length: 221, dtype: int64

# Area: cut off the last two characters (presumably the unit suffix --
# confirm against the CSV) and convert what is left to int.
area_text = df['房屋面积']
df['房屋面积'] = area_text.map(lambda text: int(text[:-2]))
df['房屋面积']

0      256
1      110
2      177
3      168
4       61
      ... 
251     89
252     88
253     88
254     87
256    138
Name: 房屋面积, Length: 221, dtype: int64

# Rent and area: a box plot followed by a histogram with a KDE curve.
# The rent box plot is drawn along x, the area box plot along y.
for column, axis_kw in (('房屋租金', 'x'), ('房屋面积', 'y')):
    sns.boxplot(data=df, **{axis_kw: column})
    plt.show()

    sns.histplot(data=df, x=column, kde=True)
    plt.show()

# Area against rent.
plt.scatter(df['房屋面积'], df['房屋租金'])
plt.show()

# Payment terms: counts as a table and as a bar chart.
df['交付方式'].value_counts()
sns.countplot(x='交付方式', data=df)
plt.show()

# Rental type: share of each category.
df['出租方式'].value_counts().plot.pie(autopct='%.2f%%')
plt.show()

# Rent distribution per payment term, then per rental type.
for category in ('交付方式', '出租方式'):
    sns.boxplot(x=category, y='房屋租金', data=df)
    plt.show()

# Orientation: share of each category, then rent per category.
df['房屋朝向'].value_counts().plot.pie(autopct='%.2f%%')
plt.show()
sns.barplot(x='房屋朝向', y='房屋租金', data=df)
plt.show()

def subway_distance(x):
    """Extract the subway distance as an int from a listing's text.

    The number is taken from between the first '约' and the following '米'.

    Args:
        x: Raw cell value. Normally a string; numeric values are accepted so
            that applying the function to an already-converted column keeps
            the values instead of resetting them to 0.

    Returns:
        The distance as an int, or 0 when the text has no '约' marker or the
        value is missing (None / NaN).

    Raises:
        ValueError: If the text between '约' and '米' is not an integer.
    """
    if not isinstance(x, str):
        # Already numeric (e.g. the cell was re-run) or missing.
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return 0
    try:
        result = x.split('约')[1].split('米')[0]
    except IndexError:
        # No '约' marker in the text: treat as "no distance given".
        return 0
    return int(result)

# Convert the subway-distance text column to integers.
df['距地铁距离'] = df['距地铁距离'].map(subway_distance)
df['距地铁距离']

# Subway distance against rent.
plt.scatter(df['距地铁距离'], df['房屋租金'])
plt.show()

# Decoration level counts as a table.
df['房屋装修'].value_counts()

# Floor, then decoration: category share as a pie chart, followed by rent
# per category as a bar chart.
for category in ('楼层', '房屋装修'):
    df[category].value_counts().plot.pie(autopct='%.2f%%')
    plt.show()
    sns.barplot(x=category, y='房屋租金', data=df)
    plt.show()

import seaborn as sns
import matplotlib.pyplot as plt
# Keep only the numeric columns.
numeric_df = df.select_dtypes(include=['number'])
# Pairwise correlations drawn as an annotated heat map without a colour bar.
corr_matrix = numeric_df.corr()
heatmap_options = dict(
    vmax=1,
    annot=True,
    linewidths=0.5,
    cbar=False,
    cmap='YlGnBu',
    annot_kws={'fontsize': 18},
)
sns.heatmap(corr_matrix, **heatmap_options)
for set_ticks in (plt.xticks, plt.yticks):
    set_ticks(fontsize=20)
plt.show()

import jieba
import collections
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
def draw_WorldCloud(df, pic_name, color='black',
                    font_path=r'C:\Windows\Fonts\msyh.ttc',
                    stopwords_path='停用词库.txt'):
    """Segment Chinese text, print the top word counts and draw a word cloud.

    Args:
        df: Iterable of text items (the caller in this file passes a single
            DataFrame column). Each item is converted with ``str``.
        pic_name: Base name of the saved image; the figure is written to
            ``f'{pic_name}.png'``.
        color: Background colour passed to ``WordCloud``.
        font_path: Font file used to render the words. The default is a
            Windows path; pass another font file on other platforms.
        stopwords_path: UTF-8 text file with one stop word per line.

    Returns:
        None. Prints the 100 most common words, saves the figure and shows it.
    """
    # Merge all items into one string.
    data = ''.join(str(item) for item in df)
    # Keep only runs of characters in the \u4e00-\u9fa5 range and glue them
    # together, dropping everything else.
    new_data = ''.join(re.findall('[\u4e00-\u9fa5]+', data))

    # Load the stop words, one per line.
    with open(stopwords_path, encoding='utf-8') as f:
        stop_words = {line.replace("\n", "") for line in f}

    # Full-mode segmentation; keep words longer than one character that are
    # not stop words.
    result_list = [
        word
        for word in jieba.cut(new_data, cut_all=True)
        if word not in stop_words and len(word) > 1
    ]

    # Frequency table: print the 100 most common words.
    word_counts = collections.Counter(result_list)
    word_counts_top = word_counts.most_common(100)
    print(word_counts_top)

    # NOTE(review): the cloud is generated from the first 500 kept tokens,
    # not from the frequency table printed above -- confirm this is intended.
    wordcloud = WordCloud(
        font_path=font_path,
        width=800,
        height=400,
        background_color=color,
        max_words=2000,
        max_font_size=150
    ).generate(' '.join(result_list[:500]))

    # Render without axes, save first, then show.
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(f'{pic_name}.png')
    plt.show()
# Draw the word cloud for the listing-highlights column.
draw_WorldCloud(df=df['房源亮点'], pic_name='房源亮点词云图')

[('房子', 80), ('小区', 72), ('押金', 56), ('房源', 55), ('选择', 55), ('价格', 54), ('入住', 52), ('环境', 48), ('装修', 41), ('精装', 40), ('开发', 39), ('房东', 38), ('开发商', 38), ('出租', 37), ('租价', 34), ('诚心', 34), ('保养', 34), ('可以', 33), ('精装修', 33), ('修配', 32), ('配置', 32), ('高小', 32), ('不二', 32), ('未来', 31), ('房屋', 29), ('包含', 28), ('地铁', 27), ('实地', 25), ('位置', 24), ('齐全', 24), ('核心', 24), ('配套', 23), ('科技', 22), ('经验', 20), ('西溪', 20), ('生活', 19), ('介绍', 19), ('多年', 19), ('开始', 19), ('经纪', 19), ('一个', 19), ('服务', 19), ('便利', 18), ('方便', 18), ('拍摄', 18), ('近地', 18), ('核心区', 18), ('只是', 18), ('诚实', 18), ('经纪人', 18), ('省去', 18), ('很多', 18), ('不必', 18), ('不必要', 18), ('必要', 18), ('麻烦', 18), ('行业', 18), ('从业', 18), ('业经', 18), ('十多', 18), ('十多年', 18), ('不会', 18), ('失望', 18), ('户型', 16), ('欢迎', 16), ('拎包', 16), ('十分', 15), ('本人', 15), ('南北', 15), ('套房', 14), ('推荐', 13), ('真实', 12), ('交通', 12), ('客户', 12), ('光临', 11), ('提供', 11), ('承诺', 11), ('店铺', 10), ('欢迎您', 10), ('您好', 9), ('高兴', 9), ('绿化', 9), ('滨江', 9), ('南房', 9), ('西南', 9), ('时间', 8), ('随时', 8), ('房产', 8), ('地铁口', 8), ('铁口', 8), ('就是', 8), ('地勘', 7), ('勘察', 7), ('理由', 7), ('华夏', 7), ('四季', 7), ('国际', 7), ('中心', 7), ('心室', 7), ('希望', 6)]

# Feature selection: keep the target (rent) and the modelling features.
# .copy() makes new_df an independent frame, so the column assignments below
# modify new_df itself rather than a slice of df (chained assignment).
new_df = df[['房屋租金', '交付方式', '出租方式', '房屋户型', '房屋面积', '房屋朝向', '楼层', '房屋装修','距地铁距离']].copy()
new_df

# Encoding: replace every object-dtype column with integer labels.
# NOTE(review): label encoding imposes an arbitrary order on nominal
# categories; distance- and linear-based models may be affected.
from sklearn.preprocessing import LabelEncoder
for col in new_df.select_dtypes(include=['object']).columns:
    new_df[col] = LabelEncoder().fit_transform(new_df[col])
new_df

from sklearn.model_selection import train_test_split
# Separate the target (rent) from the predictors.
y = new_df['房屋租金']
X = new_df.drop(columns='房屋租金')
# Hold out 20% of the rows as the test set, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print('训练集大小:', len(X_train))
print('测试集大小:', len(X_test))

训练集大小: 176
测试集大小: 45

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
# Predictors are every column except the rent; the rent is the target.
target_column = '房屋租金'
X, y = new_df.drop(target_column, axis=1), new_df[target_column]
# 80/20 train/test split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
for caption, part in (('训练集大小:', X_train), ('测试集大小:', X_test)):
    print(caption, part.shape[0])
# Train one model and print its evaluation metrics.
def train_model(ml_model):
    """Fit ``ml_model`` on the training split and report test-set metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.

    Args:
        ml_model: Estimator exposing ``fit``, ``score`` and ``predict``.

    Returns:
        None. Prints the estimator, its training score, and the test-set
        r2 score, MAE, MSE and RMSE, then shows a plot of the residuals.
    """
    print("Model is: ", ml_model)
    model = ml_model.fit(X_train, y_train)
    print("Training score: ", model.score(X_train, y_train))
    predictions = model.predict(X_test)
    r2score = r2_score(y_test, predictions)
    print("r2 score is: ", r2score)
    print('MAE:', mean_absolute_error(y_test, predictions))
    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(y_test, predictions)
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))
    # Distribution of the residuals (true value minus prediction).
    # histplot with a density scale and KDE replaces the deprecated distplot.
    sns.histplot(y_test - predictions, kde=True, stat='density',
                 kde_kws={'cut': 3})
    plt.show()  # make sure the figure is displayed
# Build and train several regressors.
# The randomised estimators are seeded with the same value as the train/test
# split so that repeated runs give the same metrics.
models = {
    "线性回归": LinearRegression(),
    "KNN回归": KNeighborsRegressor(),
    "决策树回归": DecisionTreeRegressor(random_state=42),
    "随机森林回归": RandomForestRegressor(random_state=42),
    "GBDT回归": GradientBoostingRegressor(random_state=42),
    "XGBoost回归": XGBRegressor(random_state=42)
}
for model_name, model in models.items():
    print(f"正在训练: {model_name}")
    train_model(model)
# Feature importance scores of the fitted XGBoost model, highest first.
feat_labels = X_train.columns
importances = models["XGBoost回归"].feature_importances_
indices = np.argsort(importances)[::-1]
index_list = [feat_labels[pos] for pos in indices]
value_list = [importances[pos] for pos in indices]
for rank, (label, score) in enumerate(zip(index_list, value_list), start=1):
    print(f"{rank}. {label}: {score:.4f}")
# Horizontal bar chart; the lists are reversed before plotting.
plt.figure(figsize=(10, 6))
plt.barh(list(reversed(index_list)), list(reversed(value_list)))
plt.yticks(fontsize=12)
plt.title('各特征重要程度排序', fontsize=14)
plt.show()
# Predict the test set with the fitted XGBoost model.
y_pred = models["XGBoost回归"].predict(X_test)
# True values and predictions side by side.
result_df = pd.DataFrame({'真实值': y_test, '预测值': y_pred})
print(result_df.head(10))
# Line plot of predictions (blue) and true values (red), capped at the first
# 200 test rows.
plt.figure(figsize=(10, 6))
sample_axis = range(len(y_test))[:200]
plt.plot(sample_axis, y_pred[:200], 'b', label='预测值')
plt.plot(sample_axis, y_test[:200], 'r', label='真实值')
plt.legend(loc='upper right', fontsize=15)
label_font = {'weight': 'normal', 'size': 15}
plt.xlabel('房屋数量', fontdict=label_font)
plt.ylabel('房价', fontdict=label_font)
plt.show()

训练集大小: 176
测试集大小: 45
正在训练: 线性回归
Model is:  LinearRegression()
Training score:  0.6283236932936127
r2 score is:  -0.10016844383108725
MAE: 1730.2807846318938
MSE: 5091666.976076585
RMSE: 2256.4722413707163

正在训练: KNN回归
Model is:  KNeighborsRegressor()
Training score:  0.6538480769614943
r2 score is:  0.5230410571189587
MAE: 1078.7822222222221
MSE: 2207403.886222222
RMSE: 1485.7334505967824

正在训练: 决策树回归
Model is:  DecisionTreeRegressor()
Training score:  0.9995966865142927
r2 score is:  0.5721344228136658
MAE: 1117.0444444444445
MSE: 1980195.888888889
RMSE: 1407.1943323112444

正在训练: 随机森林回归
Model is:  RandomForestRegressor()
Training score:  0.9382871231630941
r2 score is:  0.5839769200479268
MAE: 969.8347407407408
MSE: 1925387.8706985186
RMSE: 1387.583464408004

正在训练: GBDT回归
Model is:  GradientBoostingRegressor()
Training score:  0.989019740895184
r2 score is:  0.46408665347428124
MAE: 995.3278571724436
MSE: 2480249.5507339197
RMSE: 1574.8808052465176

正在训练: XGBoost回归
Model is:  XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
Training score:  0.9995867426654103
r2 score is:  0.3472446333793353
MAE: 1194.8612874348958
MSE: 3021003.7038559965
RMSE: 1738.1034790414512

1. 房屋面积: 0.5794
2. 房屋户型: 0.1798
3. 交付方式: 0.0791
4. 房屋朝向: 0.0741
5. 楼层: 0.0495
6. 距地铁距离: 0.0361
7. 房屋装修: 0.0020
8. 出租方式: 0.0000

      真实值          预测值
158  4460  3485.394531
174  3650  3865.016846
117  3300  3048.186035
210  3800  3299.256836
15   4150  1901.632690
141  5200  4504.125977
202  6000  7311.804688
244  6800  4243.289062
98   3580  2445.108887
168  6800  4417.584961

	房屋租金	交付方式	出租方式	房屋户型	房屋面积	房屋朝向	楼层	房屋装修	距地铁距离
0	11000	押一付三	整租	4室2厅4卫	256	南北	低层	精装修	665
1	8000	押一付三	整租	2室2厅1卫	110	南	中层	精装修	235
2	13500	押一付三	整租	4室2厅2卫	177	南	中层	精装修	227
3	8505	押一付三	整租	3室2厅2卫	168	南	低层	精装修	671
4	4000	押一付三	整租	3室1厅1卫	61	南	暂无数据	简装修	246
...	...	...	...	...	...	...	...	...	...
251	3500	押一付三	整租	3室2厅2卫	89	南北	中层	精装修	0
252	3800	押一付三	整租	3室2厅1卫	88	南北	高层	精装修	1012
253	4500	面议	整租	3室2厅1卫	88	南北	中层	精装修	1092
254	6100	押一付三	整租	3室1厅1卫	87	南北	中层	精装修	469
256	6999	押一付三	整租	4室1厅2卫	138	南北	高层	精装修	0

📘 爬虫+机器学习技术的杭州租房价格预测建模研究/Fenxi.ipynb