# 京东评论数据爬取 (JD.com product review scraper)
import requests
import random
import csv
import os
import time
from lxml import etree
class Jdcomment_spider(object):
    """Download product reviews from the JD.com comment API into CSV files.

    Typical use: create an instance, call :meth:`parse_max_page` for a
    product id and a score filter, then :meth:`write_csv_file` to persist
    the rows accumulated in ``self.data``.
    """

    # Endpoint queried for every page of reviews.
    COMMENT_API_URL = 'https://api.m.jd.com/'

    # Number of attempts made for one request before giving up.
    MAX_RETRIES = 5

    # NOTE(security): this is a captured browser session cookie committed to
    # source. It is kept only as a fallback so existing callers keep working;
    # prefer passing ``cookie=...`` to the constructor or setting the
    # ``JD_COOKIE`` environment variable.
    DEFAULT_COOKIE = '; '.join((
        'shshshfpa=9a567d8b-f139-4a32-374d-6c54b5b07996-1694416231',
        'shshshfpx=9a567d8b-f139-4a32-374d-6c54b5b07996-1694416231',
        '__jdu=1702965842996819412142',
        'pinId=AY2RaJhomU6bHnbw_xM_7bV9-x-f3wj7',
        'pin=jd_49cfcbe90cd4e',
        'unick=u_7qpk5bkq5dnp',
        '_tp=cQffwlfVg21JbvdgknTqmcOGLf9a7tOIYuyJI6Gm2wI%3D',
        '_pst=jd_49cfcbe90cd4e',
        'qrsc=3',
        'areaId=1',
        'ipLoc-djd=1-72-55653-0',
        'TrackID=1bBDgkbKSwNU90k4yc33FGRlOdeW4st4ecfmB1Kc0_0z6qgsvJbUF32Q5Y151kJ7xN37qUG0LhYNF3TailIFRbX3eaOY7fdYfoSxfyOqpoMs',
        'thor=B32F5AECC0EC37806380E3A374C1A4A7644B3190523E395B9AB84D2AA36E02CDFF0CC99E07695FFB8CFDE8909E32CE786D91641CC22F1823E6E85E2E972D45FC9075ADCACF10FF6C5F3E9750D79B909D94E44FF6B0622BE0DAB87795EC02079C6F6A3858F866EC6BC8554E3AB92094D81C52DD047624368955B1F32D68DAAF0CCC6934888AA92635D32C284B9DA3CDC1EBC699DF39055ED84EFBA2C4260E3454',
        'flash=2_89iYB18lttS-T8AiBSZA9h17BJJs3tvGDNNvIpTeWQcmcgoq4DZlreyjrwyR5biH1xcmY2dVSZ1t7bKZK4_64zR2KFoNjKarPAORFPnN92s*',
        'PCSYCityID=JP_0_0_0',
        'rkv=1.0',
        'unpl=JF8EAMhnNSttCkxQVR1VHBtCSVxTWw4IGB9Ub25WAw1fGFFWGFYSGkJ7XlVdWBRKFR9sYRRXXFNKXQ4bBCsSEXteXVdZDEsWC2tXVgQFDQ8VXURJQlZAFDNVCV9dSRZRZjJWBFtdT1xWSAYYRRMfDlAKDlhCR1FpMjVkXlh7VAQrAh0TE0NaUFdYC0MfBGZnDVFdW0tRBisDKxUge21QXlsAQxMzblcEZB8MF1EDGwcYG11LW1VdVQ9PHgZsbw1TVFhDUQUYAh4RIEptVw',
        '__jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_c65a7f68c3064b1b9f18bfa6b4b2e89c|1711541301147',
        'jsavif=1',
        'jsavif=1',
        'user-key=ad547ec3-468c-4948-977a-61e9d9803757',
        'cn=3',
        'avif=1',
        '__jda=143920055.1702965842996819412142.1702965843.1711533298.1711541301.48',
        '__jdc=143920055',
        '3AB9D23F7A4B3CSS=jdd03MIG54FFPWMNFIIKVMM4TWZQCSRRQET7IMAUH2OLKPTSBNWTZHZQ36VCWBZLSENKRBTOXZ6JJ5VAFMIPYTD7LUWDXCEAAAAMOP7MJ7CYAAAAAC6PI2I3D4J5XAEX',
        'xapieid=jdd03MIG54FFPWMNFIIKVMM4TWZQCSRRQET7IMAUH2OLKPTSBNWTZHZQ36VCWBZLSENKRBTOXZ6JJ5VAFMIPYTD7LUWDXCEAAAAMOP7MJ7CYAAAAAC6PI2I3D4J5XAEX',
        '__jdb=143920055.10.1702965842996819412142|48.1711541301',
        'shshshfpb=BApXeovnSfOtAoNObA36khoWEBSMOdshMB9YAZR5X9xJ1MuqIrYO2',
        '3AB9D23F7A4B3C9B=MIG54FFPWMNFIIKVMM4TWZQCSRRQET7IMAUH2OLKPTSBNWTZHZQ36VCWBZLSENKRBTOXZ6JJ5VAFMIPYTD7LUWDXCE',
    ))

    # Pool of browser identification strings used by rand_ua().
    USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML like Gecko) Chrome/16.0.912.63 Safari/535.8",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML like Gecko) Chrome/6.0.460.0 Safari/534.3",
        "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML like Gecko) Chrome/6.0.463.0 Safari/534.3",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML like Gecko) Chrome/2.0.157.0 Safari/528.9",
        "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML like Gecko) Chrome/14.0.794.0 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML like Gecko) Chrome/11.0.694.0 Safari/534.24",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML like Gecko) Chrome/14.0.810.0 Safari/535.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML like Gecko) Chrome/4.0.211.0 Safari/532.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML like Gecko) Chrome/7.0.500.0 Safari/534.6",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML like Gecko) Chrome/4.0.223.4 Safari/532.2",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML like Gecko) Chrome/17.0.963.65 Safari/535.11",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML like Gecko) Chrome/13.0.782.41 Safari/535.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML like Gecko) Chrome/11.0.682.0 Safari/534.21",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML like Gecko) Chrome/2.0.182.0 Safari/531.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML like Gecko) Chrome/7.0.531.0 Safari/534.9",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)",
    )

    def __init__(self, cookie=None):
        """Set up request headers, the row buffer and the product list.

        Args:
            cookie: Optional ``Cookie`` header value. When omitted, the
                ``JD_COOKIE`` environment variable is used if set, otherwise
                the built-in ``DEFAULT_COOKIE``.
        """
        if cookie is None:
            cookie = os.environ.get('JD_COOKIE') or self.DEFAULT_COOKIE
        self.headers = {
            'Referer': 'https://item.jd.com/100025376433.html',
            'cookie': cookie,
            'User-Agent': self.rand_ua(),
        }
        # Rows collected so far: [nickname, time, product, content, score].
        self.data = []
        self.root_path = os.path.dirname(os.path.abspath(__file__))
        # The attribute name is misspelled, but external code reads it under
        # this name, so it is kept as-is.
        self.priduct_id = ['100025376433']
        # Duration in seconds of the last successful request. Starts at 0 so
        # dynamic_sleep() works even when the very first request fails.
        self.last_response_time = 0

    def rand_ua(self):
        """Return a randomly chosen User-Agent header value."""
        return random.choice(Jdcomment_spider.USER_AGENTS)

    def getHtml(self, url, params=None, timeout=10):
        """GET ``url`` with the spider's headers, retrying on network errors.

        Args:
            url: Address to request.
            params: Optional query-string parameters.
            timeout: Seconds to wait for the server before the attempt is
                treated as failed, so a stalled connection cannot hang the
                crawl.

        Returns:
            The ``requests`` response of the first attempt that completes,
            or ``None`` when all ``MAX_RETRIES`` attempts raise.
        """
        for attempt in range(1, self.MAX_RETRIES + 1):
            started = time.monotonic()
            try:
                response = requests.get(
                    url=url,
                    headers=self.headers,
                    params=params,
                    timeout=timeout,
                )
            except requests.RequestException as exc:
                print(f"Request to {url} failed (attempt {attempt}/{self.MAX_RETRIES}): {exc}")
                continue
            # Remember how long the server took; dynamic_sleep() paces on it.
            self.last_response_time = time.monotonic() - started
            return response
        return None

    @staticmethod
    def _comment_to_row(comment):
        """Convert one comment dict from the API into a CSV row list."""
        # The API may send explicit nulls, so coerce missing text to ''.
        product = (comment.get('productSize') or '') + (comment.get('productColor') or '')
        # Flatten multi-line reviews so each review stays on one CSV line.
        content = (comment.get('content') or '').replace('\n', ',')
        return [
            comment.get('nickname'),
            comment.get('creationTime'),
            product,
            content,
            comment.get('score'),
        ]

    def parse_one_page(self, url, product_id, page, scores):
        """Fetch one page of reviews and append its rows to ``self.data``.

        Args:
            url: Comment API endpoint.
            product_id: JD product identifier.
            page: Zero-based page index.
            scores: Score filter passed to the API as ``score``.

        Returns:
            The number of comments found on the page, or ``None`` when the
            request failed or the body was not a JSON object.
        """
        params = {
            'appid': 'item-v3',
            'functionId': 'pc_club_productPageComments',
            'client': 'pc',
            'clientVersion': '1.0.0',
            # Millisecond timestamp; multiply before truncating so the
            # sub-second part is not lost.
            't': int(time.time() * 1000),
            'loginType': '3',
            'uuid': '181111935.17090388130361557545027.1709038813.1709043420.1709101096.3',
            'productId': product_id,
            'score': scores,
            'sortType': '5',
            'page': page,
            'pageSize': '10',
            'isShadowSku': '0',
            'rid': '0',
            'fold': '1',
            'bbtf': '',
            'shield': '',
        }
        response = self.getHtml(url, params=params)
        # A requests response is falsy for 4xx/5xx statuses as well as None.
        if not response:
            print(f"Failed to get response from {url}")
            return None
        try:
            payload = response.json()
        except ValueError as exc:
            # Keep the rows gathered so far instead of aborting the crawl.
            print(f"Failed to decode JSON from {url}: {exc}")
            return None
        if not isinstance(payload, dict):
            print(f"Unexpected JSON payload from {url}")
            return None
        comments = payload.get('comments') or []
        for comment in comments:
            self.data.append(self._comment_to_row(comment))
        return len(comments)

    def parse_max_page(self, product_id, scores, max_pages=101):
        """Crawl review pages for one product until they run out.

        Args:
            product_id: JD product identifier.
            scores: Score filter passed to the API.
            max_pages: Upper bound on the number of pages requested.

        Crawling stops early at the first page that parses successfully but
        contains no comments; failed pages are skipped.
        """
        for page_num in range(max_pages):
            print(f'正在获取第{page_num}页')
            found = self.parse_one_page(
                url=self.COMMENT_API_URL,
                product_id=product_id,
                page=page_num,
                scores=scores,
            )
            if found == 0:
                print(f"No comments on page {page_num}; stopping.")
                break
            self.dynamic_sleep()

    def dynamic_sleep(self):
        """Pause between requests, scaled by the last response time."""
        min_sleep = 1
        max_sleep = 6
        # Clamp the last response time into [min_sleep, max_sleep].
        sleep_time = max(min_sleep, min(max_sleep, self.last_response_time))
        # Add jitter so requests are not sent at an exact fixed interval.
        sleep_time += random.uniform(0.5, 1.5)
        print(f"Sleeping for {sleep_time} seconds...")
        time.sleep(sleep_time)

    def write_csv_file(self, file_name='jd_comment.csv'):
        """Append ``self.data`` to ``<root_path>/data/<file_name>``.

        Args:
            file_name: Target file name; ``.csv`` is appended only when the
                name does not already end with it.

        The header row is written only when the file is new or empty, so
        repeated runs do not insert duplicate headers. I/O errors are
        reported on stdout rather than raised.
        """
        head = ["用户名", "时间", "产品", "评论", "评分"]
        folder_path = os.path.join(self.root_path, 'data')
        os.makedirs(folder_path, exist_ok=True)
        if not file_name.lower().endswith('.csv'):
            file_name = f'{file_name}.csv'
        filepath = os.path.join(folder_path, file_name)
        try:
            needs_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
            with open(filepath, 'a', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                if needs_header:
                    writer.writerow(head)
                writer.writerows(self.data)
            print("Write a CSV file to path %s Successful." % filepath)
        except (OSError, csv.Error) as e:
            print("Fail to write CSV to path: %s, Case: %s" % (filepath, e))
if __name__ == '__main__':
    # Review category to download: "3" = positive, "2" = neutral,
    # "1" = negative.
    scores = "3"
    jd_spider = Jdcomment_spider()
    # `priduct_id` (sic) is the attribute name the spider class defines.
    for product_id in jd_spider.priduct_id:
        # Start every product with an empty buffer so rows from one product
        # never end up in another product's file.
        jd_spider.data = []
        filename = f'{product_id}-{scores}'
        jd_spider.parse_max_page(product_id, scores)
        jd_spider.write_csv_file(filename)
        print(f'{filename}====爬取完成')
    print('全部爬取完成')
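# 版权声明 (Copyright notice):
# 作者 (Author): 夜阑
# 链接 (Link): http://yelan.xyz/index.php/2024/04/11/%e4%ba%ac%e4%b8%9c%e8%af%84%e8%ae%ba%e6%95%b0%e6%8d%ae%e7%88%ac%e5%8f%96/
# 来源 (Source): 夜阑的小站
# 文章版权归作者所有,未经允许请勿转载。 (Copyright belongs to the author; do not repost without permission.)
# THE END
# 二维码 (QR code)
# 共有 0 条评论 (0 comments in total)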