# 实现多线程爬虫爬取某小说部分章节内容并以数据库存储（不少于10个章节。

import requests
from lxml import etree
import re
import pymysql
from time import sleep
from concurrent.futures import ThreadPoolExecutor


def get_conn():
    # 创建连接
    conn = pymysql.connect(host="127.0.0.1",
                           user="root",
                           password="686686",
                           db="novels",
                           charset="utf8")
    # 创建游标
    cursor = conn.cursor()
    return conn, cursor


def close_conn(conn, cursor):
    cursor.close()
    conn.close()


def get_xpath_resp(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)  # 用etree解析html
    return tree, resp


def get_chapters(url):
    tree, _ = get_xpath_resp(url)
    # 获取小说名字
    novel_name = tree.xpath('//*[@id="info"]/h1/text()')[0]
    # 获取小说数据节点
    dds = tree.xpath('/html/body/div[4]/dl/dd')
    title_list = []
    link_list = []
    for d in dds[:15]:
        title = d.xpath('./a/text()')[0]  # 章节标题
        title_list.append(title)
        link = d.xpath('./a/@href')[0]   # 章节链接
        chapter_url = url + link  # 构造完整链接
        link_list.append(chapter_url)
    return title_list, link_list, novel_name


def get_content(novel_name, title, url):
    try:
        cursor = None
        conn = None
        conn, cursor = get_conn()
        # 插入数据的sql
        sql = 'INSERT INTO novel(novel_name,chapter_name,content) VALUES(%s,%s,%s)'
        tree, resp = get_xpath_resp(url)
        # 获取内容
        content = re.findall('<div id="content">(.*?)</div>', resp.text)[0]
        # 对内容进行清洗
        content = content.replace('<br />', '\n').replace('&nbsp;', ' ').replace(
            '全本小说网 www.qb5.tw，最快更新<a href="https://www.qb5.tw/book_116659/">宇宙职业选手</a>最新章节！<br><br>', '')
        print(title, content)
        cursor.execute(sql, [novel_name, title, content])  # 插入数据
        conn.commit()  # 提交事务保存数据
    except:
        pass
    finally:
        sleep(2)
        close_conn(conn, cursor)  # 关闭数据库
if __name__ == '__main__':
    # 获取小说名字，标题链接，章节名称
    title_list, link_list, novel_name = get_chapters(
        'https://www.qb5.tw/book_116659/')
    with ThreadPoolExecutor(5) as t:  # 创建5个线程
        for title, link in zip(title_list, link_list):
            t.submit(get_content, novel_name, title, link)  # 启动线程

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[3], line 74
     71         close_conn(conn, cursor)  # 关闭数据库
     72 if __name__ == '__main__':
     73     # 获取小说名字，标题链接，章节名称
---> 74     title_list, link_list, novel_name = get_chapters(
     75         'https://www.qb5.tw/book_116659/')
     76     with ThreadPoolExecutor(5) as t:  # 创建5个线程
     77         for title, link in zip(title_list, link_list):

Cell In[3], line 37, in get_chapters(url)
     35 tree, _ = get_xpath_resp(url)
     36 # 获取小说名字
---> 37 novel_name = tree.xpath('//*[@id="info"]/h1/text()')[0]
     38 # 获取小说数据节点
     39 dds = tree.xpath('/html/body/div[4]/dl/dd')

IndexError: list index out of range

📘 七个爬虫小案例/小说爬取.ipynb