In [4]:
import requests
import csv
import time
import tqdm
# Global cookies and headers, defined once so each request reuses them.
#
# SECURITY NOTE(review): these are real, hardcoded session cookies for a
# logged-in JD account ('thor', 'pin', 'TrackID', 'token' are auth/session
# tokens). They will expire, and committing them to source leaks the account.
# Load them from an environment variable or an untracked config file instead.
COOKIES = {
'__jdu': '1764370966',
'shshshfpa': 'b983081b-d500-95cf-ce75-af324f42da15-1687070914',
'shshshfpx': 'b983081b-d500-95cf-ce75-af324f42da15-1687070914',
'unpl': 'JF8EALBnNSttCx5WBhgKSUURHg1SW1QJSR5XbW4NBlgISVcNGVUYRUN7XlVdWBRLFx9vYRRUWFNIXQ4eBisSEXteXVdZDEsWC2tXVgQFDQ8VXURJQlZAFDNVCV9dSRZRZjJWBFtdT1xWSAYYRRMfDlAKDlhCR1FpMjVkXlh7VAQrAh0VFUlbVVZaD3sWM2hXNWRYXU1UBxsyGiIRex8AAlgASxIGbCoFUlpdSVIEEwUcIhF7Xg',
'__jdv': '76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_bd3229cf0da748038e389c4a3293f2fb|1731824925131',
'3AB9D23F7A4B3CSS': 'jdd032NJPJXNWFKZ3XKKBVI7UYTJ7DJTTVMG7KWXCA4BCXYP4JMNCMCX7VGQPR2TI7PTAUHSBL4QXPEIIPHDNQZEZKJF2CYAAAAMTHDHTZ7QAAAAAC3V5VRVEVSDVAIX',
'_gia_d': '1',
'areaId': '12',
'jsavif': '1',
'wlfstk_smdl': 'e8c1tgskdw8mfgc4qo6ol9lc22pectbq',
'TrackID': '1RqsviCt6hbj4naUJ4-2X1Xo8yfQKHyJEC5oddM2b4mnLsJCE5dEogde0oVODb8FsXQngu1f2q2fo9UcLsNMA91HNex9bW77j-qEGNRnOGW4gO3ZxqHsdOG2fwAWwjy-g',
'thor': 'A75180384C3DD978F135FA9A5BFE0B24F1412B129B96128317E07EE5E2F53441426FDAF2C39DD257C5E37A4BB9A89DDEC80C1CA85B18EF0F6D33BDC78B55B38723578D8775A09A2C9CDDC2EBF79C49B702BCA05704CDB8CD7C68B3052B968E0B66CAFA39C3DB93C6B89AA0847733AFE11323CE6AA6956006B716A1E15F86969C3FF84A6CCD8086AEFF71E4455C44E3CC10AE723D7D61CFFEEF9AE98466060965',
'light_key': 'AASBKE7rOxgWQziEhC_QY6ya7oCXdforERpniNIwfN9U2KSgVpNVKAgbS4Qk0SIZyRIqL2W9',
'pinId': '0DRQUAoQkuOE-51lGRDIcg',
'pin': 'jd_zwIdfRmYOAep',
'unick': '%E7%A7%8B%E5%B0%8F%E6%9E%AB%E6%B8%85',
'ceshi3.com': '000',
'_tp': 'B7r2tu8GDnnEKA6Eq%2FVibA%3D%3D',
'_pst': 'jd_zwIdfRmYOAep',
'3AB9D23F7A4B3C9B': '2NJPJXNWFKZ3XKKBVI7UYTJ7DJTTVMG7KWXCA4BCXYP4JMNCMCX7VGQPR2TI7PTAUHSBL4QXPEIIPHDNQZEZKJF2CY',
'__jda': '181111935.1764370966.1687070912.1728964615.1731824925.16',
'__jdc': '181111935',
'flash': '3_gGrStCyXUZZXKQ_mtYvc2OATndLdZHQZE6CeeogF8O48erkq84a1laOCcz6-13XcruhrgKiIrnmAvd7_qsnvYYaMeX1QFqwtuTz551mb-_86XyE57jCU6pPrXVv_tfWHOzjnS9KpY-L401dAdNMFMbxdZ71QPaDmjEexdUb7pX31esDG0fZn',
'ipLoc-djd': '12-978-980-36485',
'token': '12caf6e9fecb3f5cc3ae0ae1425a99cd,3,962124',
'__jdb': '181111935.6.1764370966|16.1731824925',
'shshshfpb': 'BApXSbNHYO_ZAdaYLJ0d7e3y8xkX0hGvPB8VGDjtq9xJ1MgeCSYO2',
}
HEADERS = {
'accept': 'application/json, text/javascript, */*; q=0.01',
# Desktop Edge/Chrome user-agent string so the API treats us as a PC browser.
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
}
def get_filename(product_id, page):
    """Build a unique CSV filename containing the product ID, page number
    and the current local timestamp (YYYYMMDD_HHMMSS)."""
    stamp = time.strftime("%Y%m%d_%H%M%S")
    return "data_{}_page{}_{}.csv".format(product_id, page, stamp)
def init_csv(filename):
    """Create (or truncate) the CSV file and write the header row."""
    header = ['comment_id', 'nick_name', 'location', 'creation_time', 'score', 'content']
    with open(filename, 'w', encoding='utf-8', newline='') as fp:
        csv.writer(fp).writerow(header)
def spider(page, product_id):
    """Fetch one page of JD product comments and save them to a per-page CSV file.

    Parameters
    ----------
    page : int
        1-based page number to request.
    product_id : str
        JD product ID (the ``productId`` query parameter).

    Returns
    -------
    bool
        True if comments were fetched and written; False when the page has no
        comments or the request/parsing failed.
    """
    params = {
        'appid': 'item-v3',
        'functionId': 'pc_club_productPageComments',
        'client': 'pc',
        'clientVersion': '1.0.0',
        't': str(int(time.time() * 1000)),  # millisecond timestamp to defeat caching
        'productId': product_id,
        'score': '0',
        'sortType': '5',
        'page': page,
        'pageSize': '10',
        'isShadowSku': '0',
        'fold': '1',
    }
    try:
        response = requests.get('https://api.m.jd.com/', params=params,
                                cookies=COOKIES, headers=HEADERS, timeout=10)
        response.raise_for_status()  # raise on HTTP 4xx/5xx
        data = response.json()
        comments = data.get('comments', [])
        if not comments:
            print(f"[INFO] 第 {page} 页无评论数据,可能是商品 ID 错误或评论已爬取完毕。")
            return False
        filename = get_filename(product_id, page)  # one output file per page
        init_csv(filename)  # write the header row
        with open(filename, 'a', encoding='utf-8', newline='') as f:
            csvwriter = csv.writer(f)
            for comment in comments:
                comment_id = comment.get('id', 'N/A')
                nick_name = comment.get('nickname', '匿名用户')
                location = comment.get('location', '未知')
                creation_time = comment.get('creationTime', 'N/A')
                score = comment.get('score', 0)
                # Strip embedded newlines so each comment stays on one CSV row.
                content = comment.get('content', '').replace('\n', '')
                csvwriter.writerow([comment_id, nick_name, location, creation_time, score, content])
                print(comment_id, nick_name, location, creation_time, content)
        # BUG FIX: the message previously printed a literal placeholder instead
        # of the actual saved filename.
        print(f"[INFO] 第 {page} 页数据已保存到 {filename}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"[ERROR] 请求失败: {e}")
    except Exception as e:
        print(f"[ERROR] 解析数据时出错: {e}")
    return False
def main():
    """Prompt for a product ID and a page count, then crawl page by page."""
    product_id = input('请输入商品的 ID:').strip()
    try:
        max_page = int(input('请输入要爬取的页数:').strip())
        if max_page <= 0:
            print("[ERROR] 页数必须大于 0")
            return
    except ValueError:
        print("[ERROR] 请输入有效的整数页数")
        return
    page_range = range(1, max_page + 1)
    for page in tqdm.tqdm(page_range, desc="爬取进度", unit="页"):
        if not spider(page, product_id):
            # An empty or failed page means there is nothing more to fetch.
            break
        time.sleep(3)  # throttle requests to reduce the risk of an IP ban
    print("[INFO] 爬取完成!")
# Example product ID for testing: 100120884199
if __name__ == "__main__":
    main()
请输入商品的 ID:100120884199 请输入要爬取的页数:10
爬取进度: 0%| | 0/10 [00:00<?, ?页/s]
[ERROR] 请求失败: 403 Client Error: for url: https://api.m.jd.com/?appid=item-v3&functionId=pc_club_productPageComments&client=pc&clientVersion=1.0.0&t=1743051883083&productId=100120884199&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&fold=1 [INFO] 爬取完成!
In [ ]: