📘 爬虫+机器学习技术的杭州租房价格预测建模研究/Hangzhou.ipynb

Notebook
def _first_text(tree, path, default='暂无数据'):
    """Return the first result of evaluating *path* on *tree*, or *default*.

    Scraping is best-effort: a missing node (empty result list), an unparsed
    page, or an XPath evaluation error all fall back to *default*.
    ``except Exception`` is used instead of a bare ``except`` so that
    Ctrl-C / ``SystemExit`` still stop the crawl.
    """
    try:
        return tree.xpath(path)[0]
    except Exception:
        return default


def get_details(url):
    """Fetch one listing detail page and extract its fields.

    Relies on module-level names defined outside this block: ``requests``,
    ``etree``, ``headers`` (mutated in place: its ``referer`` entry is set
    before the request) and ``city``.

    Args:
        url: Absolute URL of the listing detail page.

    Returns:
        dict mapping field name to the extracted text. Fields whose lookup
        fails hold the placeholder string '暂无数据'.
    """
    # The header value must be the URL only; the original value repeated the
    # header name inside it ("referer: http://...").
    headers['referer'] = 'http://search.fang.com/'
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)

    # City (taken from the module-level ``city`` variable).
    city_ = city

    # Monthly rent; the unit suffix is appended only when a price was found.
    price = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[1]/div/i/text()', default=None)
    house_price = '暂无数据' if price is None else price + '元/月'

    # Payment method.
    pay_type = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[1]/div/a/text()')
    # Rental mode.
    hire_style = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[2]/div[1]/div[1]/a/text()')
    # Layout (rooms).
    house_type = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[2]/div[2]/div[1]/text()')
    # Floor area.
    house_area = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[2]/div[3]/div[1]/text()')
    # Orientation.
    house_direct = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[3]/div[1]/div[1]/text()')
    # Floor level.
    floor = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[3]/div[2]/div[1]/a/text()')
    # Decoration / furnishing.
    house_dec = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[3]/div[3]/div[1]/a/text()')
    # Residential community.
    xiaoqu = _first_text(tree, '//*[@id="agantzfxq_C02_07"]/text()')
    # Distance to the subway.
    subway_meter = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[4]/div[2]/div/a/text()')
    # Address.
    place = _first_text(
        tree, '/html/body/div[5]/div[1]/div[5]/div[4]/div[3]/div[2]/a/text()')

    # Facilities: all matching <li> texts joined with spaces.
    # NOTE(review): an empty match yields '' (not the placeholder), and the
    # placeholder used here carries a trailing space; both are kept as in the
    # original - confirm whether downstream cleaning depends on them.
    try:
        other_fac_list = tree.xpath(
            '/html/body/div[5]/div[2]/div[1]/div[2]/div[2]/ul/li/text()')
        other_fac = ' '.join(other_fac_list)
    except Exception:
        other_fac = '暂无数据 '

    # Listing highlights.
    house_light = _first_text(
        tree,
        '/html/body/div[5]/div[2]/div[1]/div[1]/div[2]/div/div/ul/li[1]/div[2]/text()')

    # The original computed these values and discarded them; returning them is
    # backward-compatible for callers that ignore the result.
    return {
        'city': city_,
        'house_price': house_price,
        'pay_type': pay_type,
        'hire_style': hire_style,
        'house_type': house_type,
        'house_area': house_area,
        'house_direct': house_direct,
        'floor': floor,
        'house_dec': house_dec,
        'xiaoqu': xiaoqu,
        'subway_meter': subway_meter,
        'place': place,
        'other_fac': other_fac,
        'house_light': house_light,
    }


def main(page, f):
    """Crawl one result page of hz.zu.fang.com and visit every listing on it.

    Relies on module-level names defined outside this block: ``requests``,
    ``etree``, ``headers``, ``time`` and ``random``.

    Args:
        page: Page number interpolated into the list URL.
        f: Object with a ``flush()`` method; it is flushed after each listing.
            Nothing in this block writes to it.

    Raises:
        RuntimeError: If the result page contains no listings. The printed
            message attributes this to an IP block / captcha challenge.
    """
    city_en = 'hz'
    url = f'https://{city_en}.zu.fang.com/house/i3{page}/?rfss=1-9988c4a227ce113113-a6'
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)
    dl_list = tree.xpath('//div[@class="houseList"]/dl')
    print(len(dl_list))

    if not dl_list:
        message = 'IP异常!验证码警告!请返回官网刷新验证码!'
        print(message)
        # The original used a bare ``raise`` with no active exception, which
        # only surfaced as an unrelated-looking RuntimeError; raise the same
        # exception type explicitly, with the actual reason attached.
        raise RuntimeError(message)

    for dl in dl_list:
        try:
            # Looked up inside the try so that a single entry without a link
            # is reported and skipped instead of aborting the whole page.
            href = f'https://{city_en}.zu.fang.com' + dl.xpath('./dt/a/@href')[0]
            get_details(href)
            f.flush()
            # Random sub-second pause between detail requests.
            time.sleep(random.random())
        except Exception as e:
            print(e)