# 京东评论数据爬取 (JD.com product review scraper)

import requests
import random
import csv
import os
import time
from lxml import etree

class Jdcomment_spider(object):
    """Scrape JD.com product reviews and store them as CSV.

    Pages are requested one at a time from ``https://api.m.jd.com/``; every
    parsed review is appended to ``self.data`` as
    ``[nickname, creationTime, productSize + productColor, content, score]``
    and can then be flushed to disk with :meth:`write_csv_file`.
    """

    # Seconds to wait on a single HTTP attempt before giving up on it.
    DEFAULT_TIMEOUT = 10

    def __init__(self):
        # NOTE(review): the cookie embeds a logged-in session captured from a
        # browser. It will expire and should not live in source control --
        # consider loading it from configuration instead.
        # The value must be a single line: a newline inside an HTTP header
        # value is invalid, so the two fragments are joined by adjacent-literal
        # concatenation.
        self.headers = {
            'Referer': 'https://item.jd.com/100025376433.html',
            'cookie': (
                'shshshfpa=9a567d8b-f139-4a32-374d-6c54b5b07996-1694416231; shshshfpx=9a567d8b-f139-4a32-374d-6c54b5b07996-1694416231; __jdu=1702965842996819412142; pinId=AY2RaJhomU6bHnbw_xM_7bV9-x-f3wj7; pin=jd_49cfcbe90cd4e; unick=u_7qpk5bkq5dnp; _tp=cQffwlfVg21JbvdgknTqmcOGLf9a7tOIYuyJI6Gm2wI%3D; _pst=jd_49cfcbe90cd4e; qrsc=3; areaId=1; ipLoc-djd=1-72-55653-0; TrackID=1bBDgkbKSwNU90k4yc33FGRlOdeW4st4ecfmB1Kc0_0z6qgsvJbUF32Q5Y151kJ7xN37qUG0LhYNF3TailIFRbX3eaOY7fdYfoSxfyOqpoMs; thor=B32F5AECC0EC37806380E3A374C1A4A7644B3190523E395B9AB84D2AA36E02CDFF0CC99E07695FFB8CFDE8909E32CE786D91641CC22F1823E6E85E2E972D45FC9075ADCACF10FF6C5F3E9750D79B909D94E44FF6B0622BE0DAB87795EC02079C6F6A3858F866EC6BC8554E3AB92094D81C52DD047624368955B1F32D68DAAF0CCC6934888AA92635D32C284B9DA3CDC1EBC699DF39055ED84EFBA2C4260E3454; flash=2_89iYB18lttS-T8AiBSZA9h17BJJs3tvGDNNvIpTeWQcmcgoq4DZlreyjrwyR5biH1xcmY2dVSZ1t7bKZK4_64zR2KFoNjKarPAORFPnN92s*; PCSYCityID=JP_0_0_0; rkv=1.0; unpl=JF8EAMhnNSttCkxQVR1VHBtCSVxTWw4IGB9Ub25WAw1fGFFWGFYSGkJ7XlVdWBRKFR9sYRRXXFNKXQ4bBCsSEXteXVdZDEsWC2tXVgQFDQ8VXURJQlZAFDNVCV9dSRZRZjJWBFtdT1xWSAYYRRMfDlAKDlhCR1FpMjVkXlh7VAQrAh0TE0NaUFdYC0MfBGZnDVFdW0tRBisDKxUge21QXlsAQxMzblcEZB8MF1EDGwcYG11LW1VdVQ9PHgZsbw1TVFhDUQUYAh4RIEptVw; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_c65a7f68c3064b1b9f18bfa6b4b2e89c|1711541301147; jsavif=1; jsavif=1; user-key=ad547ec3-468c-4948-977a-61e9d9803757; cn=3; avif=1; __jda=143920055.1702965842996819412142.1702965843.1711533298.1711541301.48; __jdc=143920055; 3AB9D23F7A4B3CSS=jdd03MIG54FFPWMNFIIKVMM4TWZQCSRRQET7IMAUH2OLKPTSBNWTZHZQ36VCWBZLSENKRBTOXZ6JJ5VAFMIPYTD7LUWDXCEAAAAMOP7MJ7CYAAAAAC6PI2I3D4J5XAEX; xapieid=jdd03MIG54FFPWMNFIIKVMM4TWZQCSRRQET7IMAUH2OLKPTSBNWTZHZQ36VCWBZLSENKRBTOXZ6JJ5VAFMIPYTD7LUWDXCEAAAAMOP7MJ7CYAAAAAC6PI2I3D4J5XAEX; __jdb=143920055.10.1702965842996819412142|48.1711541301; shshshfpb=BApXeovnSfOtAoNObA36khoWEBSMOdshMB9YAZR5X9xJ1MuqIrYO2; '
                '3AB9D23F7A4B3C9B=MIG54FFPWMNFIIKVMM4TWZQCSRRQET7IMAUH2OLKPTSBNWTZHZQ36VCWBZLSENKRBTOXZ6JJ5VAFMIPYTD7LUWDXCE'
            ),
            'User-Agent': self.rand_ua()
        }
        # Rows collected so far; each row is one review.
        self.data = list()
        # CSV output is written to a "data" folder next to this file.
        self.root_path = os.path.dirname(os.path.abspath(__file__))
        # Product ids to crawl. The attribute name is misspelled but is part
        # of the public interface read by the script entry point, so it stays.
        self.priduct_id = ['100025376433']
        # Duration of the most recent successful request, in seconds.
        # Initialised here so dynamic_sleep() works even when the very first
        # request fails.
        self.last_response_time = 0.0

    def rand_ua(self):
        """Return a randomly chosen User-Agent header value."""
        user_agents = [
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML like Gecko) Chrome/16.0.912.63 Safari/535.8",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML like Gecko) Chrome/6.0.460.0 Safari/534.3",
            "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML like Gecko) Chrome/6.0.463.0 Safari/534.3",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML like Gecko) Chrome/2.0.157.0 Safari/528.9",
            "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML like Gecko) Chrome/14.0.794.0 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML like Gecko) Chrome/11.0.694.0 Safari/534.24",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML like Gecko) Chrome/14.0.810.0 Safari/535.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML like Gecko) Chrome/4.0.211.0 Safari/532.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.6 (KHTML like Gecko) Chrome/7.0.500.0 Safari/534.6",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; TencentTraveler)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.7.5) Gecko/20041107 Firefox/1.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; Maxthon; .NET CLR 1.1.4322)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML like Gecko) Chrome/4.0.223.4 Safari/532.2",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML like Gecko) Chrome/17.0.963.65 Safari/535.11",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML like Gecko) Chrome/13.0.782.41 Safari/535.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.21 (KHTML like Gecko) Chrome/11.0.682.0 Safari/534.21",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML like Gecko) Chrome/2.0.182.0 Safari/531.0",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.9 (KHTML like Gecko) Chrome/7.0.531.0 Safari/534.9",
            "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)",
        ]
        return random.choice(user_agents).strip()

    def getHtml(self, url, params=None, timeout=DEFAULT_TIMEOUT):
        """GET ``url`` with the spider's headers, retrying on network errors.

        Args:
            url: Address to request.
            params: Optional query-string parameters.
            timeout: Seconds to wait on each attempt; without one a stalled
                connection would block the crawl indefinitely.

        Returns:
            The ``requests.Response`` of the first attempt that completes, or
            ``None`` when all five attempts raise a ``requests`` error. The
            HTTP status is not checked here.

        Side effects:
            Stores the duration of the successful attempt in
            ``self.last_response_time``.
        """
        max_attempts = 5
        for attempt in range(1, max_attempts + 1):
            started = time.perf_counter()
            try:
                response = requests.get(
                    url=url, headers=self.headers, params=params, timeout=timeout
                )
            except requests.RequestException as exc:
                print(f"Request to {url} failed (attempt {attempt}/{max_attempts}): {exc}")
                if attempt < max_attempts:
                    # Short pause so an immediate retry does not hit the
                    # same transient failure.
                    time.sleep(1)
                continue
            self.last_response_time = time.perf_counter() - started
            return response
        return None

    def parse_one_page(self, url, product_id, page, scores):
        """Fetch one page of reviews and append its rows to ``self.data``.

        Args:
            url: API endpoint to query.
            product_id: JD product id whose reviews are requested.
            page: Zero-based page index.
            scores: Value sent as the ``score`` query parameter.

        Returns:
            The number of reviews appended, or ``None`` when the request
            failed or the body could not be used.
        """
        params = {
            'appid': 'item-v3',
            'functionId': 'pc_club_productPageComments',
            'client': 'pc',
            'clientVersion': '1.0.0',
            # Millisecond timestamp; multiply before truncating so the
            # sub-second part is kept.
            't': int(time.time() * 1000),
            'loginType': '3',
            'uuid': '181111935.17090388130361557545027.1709038813.1709043420.1709101096.3',
            'productId': product_id,
            'score': scores,
            'sortType': '5',
            'page': page,
            'pageSize': '10',
            'isShadowSku': '0',
            'rid': '0',
            'fold': '1',
            'bbtf': '',
            'shield': '',
        }
        response = self.getHtml(url, params=params)
        # A Response is falsy for 4xx/5xx statuses; ``ok`` states that check
        # explicitly.
        if response is None or not response.ok:
            print(f"Failed to get response from {url}")
            return None

        try:
            js_data = response.json()
        except ValueError:
            # The body was not JSON (requests' JSON errors derive from
            # ValueError).
            print(f"Response from {url} is not valid JSON")
            return None
        if not isinstance(js_data, dict):
            print(f"Unexpected JSON payload from {url}")
            return None

        # ``or`` also covers keys that are present but null in the payload.
        comments_list = js_data.get('comments') or []
        for comment in comments_list:
            product = (comment.get('productSize') or '') + (comment.get('productColor') or '')
            # Flatten multi-line reviews so each one occupies a single cell line.
            content = (comment.get('content') or '').replace('\n', ',')
            self.data.append([
                comment.get('nickname'),
                comment.get('creationTime'),
                product,
                content,
                comment.get('score'),
            ])
        return len(comments_list)

    def parse_max_page(self, product_id, scores, max_pages=101):
        """Crawl review pages ``0 .. max_pages - 1`` for one product.

        Stops early once a page is fetched successfully but contains no
        reviews, since further requests would only add delay. A failed page
        does not stop the crawl.

        Args:
            product_id: JD product id to crawl.
            scores: Value sent as the ``score`` query parameter.
            max_pages: Upper bound on the number of pages requested.
        """
        api_url = 'https://api.m.jd.com/'
        for page_num in range(max_pages):
            print(f'正在获取第{page_num}页')
            fetched = self.parse_one_page(
                url=api_url, product_id=product_id, page=page_num, scores=scores
            )
            if fetched == 0:
                break
            self.dynamic_sleep()

    def _next_sleep_time(self):
        """Return the delay in seconds to wait before the next request."""
        min_sleep = 1
        max_sleep = 6
        # Fall back to 0 when no request has completed yet.
        last = getattr(self, 'last_response_time', 0.0)
        # Clamp the last response time into [min_sleep, max_sleep].
        sleep_time = max(min_sleep, min(max_sleep, last))
        # Add jitter so requests are not sent at an exact fixed interval.
        return sleep_time + random.uniform(0.5, 1.5)

    def dynamic_sleep(self):
        """Sleep for a delay derived from the last response time plus jitter."""
        sleep_time = self._next_sleep_time()
        print(f"Sleeping for {sleep_time} seconds...")
        time.sleep(sleep_time)

    def write_csv_file(self, file_name='jd_comment.csv'):
        """Append ``self.data`` to ``<root_path>/data/<file_name>`` as CSV.

        Args:
            file_name: Target file name; ``.csv`` is appended only when the
                name does not already end with it.

        The header row is written only when the file is new or empty, so
        repeated calls do not scatter header rows through the data. Errors
        while writing are reported on stdout rather than raised.
        """
        head = ["用户名", "时间", "产品", "评论", "评分"]
        folder_path = os.path.join(self.root_path, 'data')
        os.makedirs(folder_path, exist_ok=True)
        if not file_name.lower().endswith('.csv'):
            file_name = f'{file_name}.csv'
        filepath = os.path.join(folder_path, file_name)
        try:
            needs_header = not os.path.exists(filepath) or os.path.getsize(filepath) == 0
            # utf_8_sig adds a BOM so Excel detects the encoding.
            with open(filepath, 'a', newline='', encoding='utf_8_sig') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                if needs_header:
                    writer.writerow(head)
                writer.writerows(self.data)
            print("Write a CSV file to path %s Successful." % filepath)
        except Exception as e:
            # Best-effort export: report the problem and keep the crawl alive.
            print("Fail to write CSV to path: %s, Case: %s" % (filepath, e))

if __name__ == '__main__':
    # Value sent as the review filter: "3" = positive, "2" = neutral,
    # "1" = negative.
    scores = "3"
    jd_spider = Jdcomment_spider()
    # Crawl every configured product and export one CSV file per product.
    for product_id in jd_spider.priduct_id:
        # Start from an empty buffer so rows from the previous product are
        # not written into this product's file.
        jd_spider.data = []
        filename = '-'.join((product_id, scores))
        jd_spider.parse_max_page(product_id, scores)
        jd_spider.write_csv_file(filename)
        print(f'{filename}====爬取完成')
    print('全部爬取完成')

 

# 版权声明:
# 作者:夜阑
# 链接:http://yelan.xyz/index.php/2024/04/11/%e4%ba%ac%e4%b8%9c%e8%af%84%e8%ae%ba%e6%95%b0%e6%8d%ae%e7%88%ac%e5%8f%96/
# 来源:夜阑的小站
# 文章版权归作者所有,未经允许请勿转载。

# THE END
# 分享
# 二维码
# < <上一篇
# 下一篇>>