zoukankan      html  css  js  c++  java
  • python3多线程爬取京东投诉信息

    开启线程池示例

    import time
    import threading
    from concurrent.futures import ThreadPoolExecutor
    
    
    # Shared worker pool; ThreadPoolExecutor(100) allows up to 100 worker threads.
    pool =ThreadPoolExecutor(100)
    # Futures returned by pool.submit(), kept so the script can wait on them later.
    spider_list = []
    
    # Crawl task.
    # The argument identifies the unit of work: a list-page number here, though a
    # detail URL could be passed the same way (def func1(url)).
    def func1(page):
        """Stand-in crawl task: print the marker ``a`` followed by ``page``."""
        marker = "a"
        print(marker, page)
    
    # Number of tasks to submit; this could equally be a list of page or detail URLs.
    pages = 50
    for page in range(pages):
        # NOTE: this line runs in the submitting thread, so the id printed is that
        # thread's id, not the id of the worker that will execute func1.
        print('running thread id : %d   now=%d' % (threading.get_ident(), page))
        # Hand the page number to the pool; submit() returns a Future immediately.
        str_page = pool.submit(func1,page)
        # Keep the Future so completion can be awaited below.
        spider_list.append(str_page)
        print("spider_list=",spider_list)

    # Wait for every task; Future.result() blocks until the task is done and
    # re-raises any exception the task raised.
    # The loop variable is deliberately not named ``list``: at module level that
    # would rebind the builtin for everything that runs afterwards.
    for future in spider_list:
        future.result()
    print('线程全部执行完毕')
    

    一、多线程爬取京东投诉信息

    #!/usr/bin/env python
    # -*- coding=utf-8 -*-
    
    import json
    import threading
    import time
    from requests_html import HTMLSession
    from concurrent.futures import ThreadPoolExecutor
    import warnings
    warnings.filterwarnings("ignore")
    
    # One HTMLSession shared by every request made in this script.
    session = HTMLSession()
    # Passed as ``proxies=`` to session.get(); None means no proxy is configured.
    proxies =None
    
    # Thread pool that runs one dewu_company(page) task per list page.
    pool = ThreadPoolExecutor(30)
    # Parsed complaint records; parse_detail appends to it.
    big_list = []
    # Futures returned by pool.submit() in main().
    pool_list = []
    
    def dewu_company(pages):
        """Fetch one page of the complaint list and parse every complaint on it.

        The ``couid`` sent below is the one the inline note labels as JD, even
        though the function name mentions another company.

        Args:
            pages: Page number to request from the list API.

        Returns:
            None. Each complaint on the page is handed to ``parse_detail``.

        Raises:
            Any error from ``session.get`` or ``res.json()``, and ``KeyError`` /
            ``TypeError`` when the response lacks ``result.data.complaints`` or a
            complaint lacks ``main.url``. These propagate to the caller.
        """
        # Progress marker: which page is being crawled.
        print("第"+str(pages)+"页")
        # Millisecond timestamp, sent as the "_" query parameter.
        t = str(int(time.time()*1000))

        url = "https://tousu.sina.com.cn/api/company/received_complaints"
        headers = {
            "authority": "tousu.sina.com.cn",
            "method": "GET",
            "path": "/api/company/received_complaints?callback=jQuery11120045959640946885205_1584672560291&couid=7046706808&type=1&page_size=10&page=4&_=1584672560295",
            "scheme": "https",
            "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "referer": "https://tousu.sina.com.cn/company/view/?couid=7046706808",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }

        params = {
            # Company ids: 1878960481 Alibaba, 5650743478 JD, 7046706808 Dewu.
            "couid": "5650743478",
            "type": "1",
            "page_size": "10",
            "page": pages,
            "_": t,
        }

        res = session.get(url, params=params, headers=headers, proxies=proxies, verify=False, timeout=5)

        # ``or []`` tolerates a null complaints list instead of failing in the loop.
        info_list = res.json()["result"]["data"]["complaints"] or []
        for info in info_list:
            info_url = 'https:' + info['main']['url']
            # Do not ``return`` here: returning inside the loop would stop after
            # the first complaint and drop the rest of the page.
            parse_detail(info_url, info)
    
    
    # Serialises appends to the output file: parse_detail runs on several pool
    # worker threads at once.
    _write_lock = threading.Lock()


    def parse_detail(info_url,info):
        """Scrape one complaint detail page and append the record to the JSON file.

        Args:
            info_url: Absolute URL of the complaint detail page, e.g.
                https://tousu.sina.com.cn/complaint/view/17349163730/
            info: The complaint's entry from the list API. When the complaint has
                videos, the parsed record is stored on it under '投诉详情'.

        Returns:
            None. On success the record is appended to the module-level
            ``big_list`` and written as one line to '京东投诉信息.json'. On any
            failure the error is printed and nothing is stored or written.
        """
        try:
            # Timeout added so one stalled request cannot hang a worker forever.
            page = session.get(info_url, proxies=proxies, verify=False, timeout=10)
            question = '//div[@class="ts-d-question"]/ul/'
            # ``[0]`` raises IndexError when the page lacks a field; that drops
            # the whole record via the except clause below.
            new_dict = {
                '投诉编号': page.html.xpath(question + 'li[1]/text()')[0],
                '投诉对象': page.html.xpath(question + 'li[2]/a//text()')[0],
                '投诉问题': page.html.xpath(question + 'li[3]/text()')[0],
                '投诉要求': page.html.xpath(question + 'li[4]/text()')[0],
                '涉诉金额': page.html.xpath(question + 'li[5]/text()')[0],
                '投诉进度': page.html.xpath(question + 'li[6]/b/text()')[0],
                '投诉进程详情': page.html.xpath('//*[@class="ts-d-steplist"]')[0].text,
            }

            # Complaint images; the key is only present when there is at least one.
            img_info_list = [
                "https:" + href
                for href in page.html.xpath('//*[@class="example-image-link"]/@href')
            ]
            if img_info_list:
                new_dict['投诉图片'] = img_info_list

            # Complaint videos: fetch the play-info JSON for each video id.
            new_vide_list = []
            for vide_id in page.html.xpath('//*[@class="video-ico"]/@data-id'):
                t = int(time.time())
                vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
                # Separate name so the detail-page response is not overwritten.
                video_res = session.get(vide_info_url, verify=False, timeout=10)
                new_vide_list.append(json.loads(video_res.text))

            if new_vide_list:
                new_dict['投诉视频详情'] = new_vide_list
                info['投诉详情'] = new_dict
        except Exception as e:
            # Best-effort crawl: report the failing page and skip it, without
            # writing anything for it.
            print(f"{info_url}: {e}")
            return

        big_list.append(new_dict)
        print("big_list==",big_list)

        # Append only the new record, one JSON array per line. Dumping the whole
        # accumulated big_list on every call would repeat earlier records.
        with _write_lock:
            with open('京东投诉信息.json', "a+", encoding='utf-8') as fw:
                fw.write(json.dumps([new_dict], ensure_ascii=False) + '\n')
    
    
    def main(pages):
        """Crawl list pages concurrently on the thread pool and print the elapsed time.

        Args:
            pages: Number of list pages to crawl. Pages 1..pages are requested;
                numbering starts at 1 as in the sibling script's ``range(1, 20)``.

        Returns:
            None.
        """
        startTime = time.time()
        # One pool task per list page.
        for page in range(1, pages + 1):
            pool_list.append(pool.submit(dewu_company, page))
        for n in pool_list:
            try:
                # result() blocks until the task ends and re-raises its exception.
                n.result()
            except Exception as e:
                # A failed page must not stop the wait on the remaining pages.
                print(e)
        print("全部结束并保存本地")

        # Records are persisted by parse_detail as each one is parsed, so no
        # file write happens here.
        endTime = time.time()
        print('Done, Time cost: %s ' % (endTime - startTime))
    
    if __name__ == '__main__':
        # Number of list pages to crawl.
        main(20)
    
    

    20页数据爬取时间:Done, Time cost: 1.6854908466339111

    二、多线程爬取阿里详情投诉信息

    #!/usr/bin/env python
    # -*- coding=utf-8 -*-
    
    
    import json
    import threading
    import time
    from requests_html import HTMLSession
    from concurrent.futures import ThreadPoolExecutor
    import warnings
    warnings.filterwarnings("ignore")
    
    # One HTMLSession shared by every request made in this script.
    session = HTMLSession()
    # Passed as ``proxies=`` to session.get(); None means no proxy is configured.
    proxies =None
    
    
    def dewu_company(pages=19):
        """Crawl the complaint list page by page, parsing details in threads.

        Pages are fetched one after another. For each page, one thread per
        complaint runs ``parse_detail``, and all of them are joined before the
        next page is requested.

        Args:
            pages: Number of list pages to crawl, starting at page 1. The default
                of 19 matches the previously hard-coded ``range(1, 20)``.

        Returns:
            None.
        """
        # Loop-invariant request data, built once instead of once per page.
        url = "https://tousu.sina.com.cn/api/company/received_complaints"
        headers = {
            "authority": "tousu.sina.com.cn",
            "method": "GET",
            "path": "/api/company/received_complaints?callback=jQuery11120045959640946885205_1584672560291&couid=7046706808&type=1&page_size=10&page=4&_=1584672560295",
            "scheme": "https",
            "accept": "text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "referer": "https://tousu.sina.com.cn/company/view/?couid=7046706808",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36",
            "x-requested-with": "XMLHttpRequest",
        }

        for page in range(1, pages + 1):
            print("第"+str(page)+"页")
            params = {
                # Company ids: 1878960481 Alibaba, 5650743478 JD, 7046706808 Dewu.
                "couid": "1878960481",
                "type": "1",
                "page_size": "10",
                "page": page,
                # Millisecond timestamp, sent as the "_" query parameter.
                "_": str(int(time.time()*1000)),
            }

            try:
                res = session.get(url, params=params, headers=headers, proxies=proxies, verify=False, timeout=5)
                info_list = res.json()["result"]["data"]["complaints"] or []
            except (OSError, ValueError, KeyError, TypeError) as e:
                # requests' errors derive from OSError and JSON decode errors from
                # ValueError. Skip this page rather than abort the remaining ones.
                print("第"+str(page)+"页请求失败:", e)
                continue

            # One thread per complaint detail page.
            ths = []
            for info in info_list:
                info_url = 'https:' + info['main']['url']
                th = threading.Thread(target=parse_detail, args=(info_url, info))
                th.start()
                ths.append(th)
                # Once more than 10 threads are outstanding, drain them first.
                if len(ths) > 10:
                    for th_one in ths:
                        th_one.join()
                    ths = []
            # Wait for this page's remaining threads before the next page.
            for th_one in ths:
                th_one.join()
    
    
    # Serialises appends to the output file: parse_detail runs in several
    # threads at once.
    _write_lock = threading.Lock()


    def parse_detail(info_url,info):
        """Scrape one complaint detail page and append the record to the JSON file.

        Args:
            info_url: Absolute URL of the complaint detail page, e.g.
                https://tousu.sina.com.cn/complaint/view/17349163730/
            info: The complaint's entry from the list API. When the complaint has
                videos, the parsed record is stored on it under '投诉详情'.

        Returns:
            None. On success the record is written as one line (a one-element
            JSON array) to '阿里投诉信息.json'. On any failure the error is
            printed and nothing is written.
        """
        try:
            # Timeout added so one stalled request cannot hang its thread forever.
            page = session.get(info_url, proxies=proxies, verify=False, timeout=10)
            question = '//div[@class="ts-d-question"]/ul/'
            # ``[0]`` raises IndexError when the page lacks a field; that drops
            # the whole record via the except clause below.
            new_dict = {
                '投诉编号': page.html.xpath(question + 'li[1]/text()')[0],
                '投诉对象': page.html.xpath(question + 'li[2]/a//text()')[0],
                '投诉问题': page.html.xpath(question + 'li[3]/text()')[0],
                '投诉要求': page.html.xpath(question + 'li[4]/text()')[0],
                '涉诉金额': page.html.xpath(question + 'li[5]/text()')[0],
                '投诉进度': page.html.xpath(question + 'li[6]/b/text()')[0],
                '投诉进程详情': page.html.xpath('//*[@class="ts-d-steplist"]')[0].text,
            }

            # Complaint images; the key is only present when there is at least one.
            img_info_list = [
                "https:" + href
                for href in page.html.xpath('//*[@class="example-image-link"]/@href')
            ]
            if img_info_list:
                new_dict['投诉图片'] = img_info_list

            # Complaint videos: fetch the play-info JSON for each video id.
            new_vide_list = []
            for vide_id in page.html.xpath('//*[@class="video-ico"]/@data-id'):
                t = int(time.time())
                vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
                # Separate name so the detail-page response is not overwritten.
                video_res = session.get(vide_info_url, verify=False, timeout=10)
                new_vide_list.append(json.loads(video_res.text))

            if new_vide_list:
                new_dict['投诉视频详情'] = new_vide_list
                info['投诉详情'] = new_dict
        except Exception as e:
            # Best-effort crawl: report the failing page and skip it, so no
            # empty "[]" line is written for it.
            print(f"{info_url}: {e}")
            return

        # Still a one-element list, so each output line keeps its original shape.
        big_list = [new_dict]
        print("big_list==",big_list,len(big_list))

        with _write_lock:
            with open('阿里投诉信息.json', "a+", encoding='utf-8') as fw:
                fw.write(json.dumps(big_list, ensure_ascii=False) + '\n')
    
    
    
    if __name__ == '__main__':
        # Time the whole crawl; dewu_company() joins its threads before returning.
        startTime = time.time()
        dewu_company()
        endTime = time.time()
        print ('Done, Time cost: %s ' % (endTime - startTime))
    
    

    19页数据爬取时间(range(1, 20) 实际只爬取第1–19页):Done, Time cost: 20.348562240600586

  • 相关阅读:
    03.八种数据类型
    07.条件与控制
    11.函数作用域及闭包
    04.深入数据类型
    201871010109胡欢欢《面向对象程序设计(java)》第十周学习总结 201871010109
    201871010109胡欢欢《面向对象程序设计(java)》第一周学习总结 201871010109
    201871010109胡欢欢《面向对象程序设计(java)》第二周学习总结 201871010109
    201871010109胡欢欢《面向对象程序设计(java)》第十一周学习总结 201871010109
    201871010109胡欢欢《面向对象程序设计(java)》第四周学习总结会 201871010109
    《2019面向对象程序设计(java)课程学习进度条》 201871010109
  • 原文地址:https://www.cnblogs.com/gqv2009/p/12548884.html
Copyright © 2011-2022 走看看