zoukankan      html  css  js  c++  java
  • 开线程爬取黑猫里的阿里投诉信息

    仅供学习,请适度开线程

    一.代码

    import requests
    from requests_html import HTMLSession
    import time
    from concurrent.futures import ThreadPoolExecutor
    import json
    
    # Shared worker pool; each submitted task scrapes one listing page.
    pool = ThreadPoolExecutor(max_workers=30)
    # Accumulates one dict per scraped complaint; serialized to JSON by run().
    big_list = list()
    # Futures returned by pool.submit(), kept so run() can wait on them.
    pool_name_list = list()
    # requests_html session used to fetch and parse complaint detail pages.
    session = HTMLSession()
    
    def _absolute_url(url):
        """Return ``url`` with ``https:`` prepended when it is protocol-relative.

        Only values starting with ``//`` are changed. Empty values and URLs
        that already carry a scheme are returned untouched, so they are never
        turned into malformed strings such as ``https:https://...``.
        """
        if url and url.startswith('//'):
            return 'https:' + url
        return url

    def _scrape_complaint_detail(info_url, timeout):
        """Fetch one complaint detail page and return its scraped fields.

        Args:
            info_url: Absolute URL of the complaint detail page.
            timeout: Per-request timeout in seconds.

        Returns:
            A dict with the summary fields, the progress text, the list of
            image URLs and the list of decoded video-info JSON payloads.

        Raises:
            IndexError: if an expected node is missing from the page.
            Exception: whatever the detail-page HTTP request raises.
        """
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # from the original script, confirm it is really required.
        html = session.get(info_url, verify=False, timeout=timeout).html

        # All summary fields live under the same <ul>; only the tail differs.
        summary_root = '/html/body/div[2]/div/div/div[1]/div[2]/ul/'
        summary_fields = (
            ('投诉编号', 'li[1]/text()'),
            ('投诉对象', 'li[2]/a/text()'),
            ('投诉问题', 'li[3]/text()'),
            ('投诉要求', 'li[4]/text()'),
            ('涉诉金额', 'li[5]/text()'),
            ('投诉进度', 'li[6]/b/text()'),
        )
        detail = {
            key: html.xpath(summary_root + tail)[0]
            for key, tail in summary_fields
        }
        detail['投诉进程详情'] = html.xpath('//*[@class="ts-d-steplist"]')[0].text
        detail['投诉图片'] = [
            _absolute_url(href)
            for href in html.xpath('//*[@class="example-image-link"]/@href')
        ]

        video_ids = html.xpath('//*[@class="video-ico"]/@data-id')
        print(video_ids)
        videos = []
        for vide_id in video_ids:
            t = int(time.time())
            vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
            # Video metadata is best-effort: a failed request or a non-JSON
            # body drops only this video, not the whole complaint.
            try:
                video_res = session.get(vide_info_url, verify=False, timeout=timeout)
                videos.append(video_res.json())
            except (requests.RequestException, ValueError):
                continue
        detail['投诉视频详情'] = videos
        return detail

    def dewu_company(x, page_size=10, timeout=15):
        """Scrape one listing page of complaints and append them to ``big_list``.

        Requests page ``x + 1`` of the company's received complaints, then
        visits every complaint's detail page and stores the scraped fields
        under the ``'投诉详情'`` key of the complaint dict.

        Args:
            x: Zero-based page index; the API is queried with ``x + 1``.
            page_size: Number of complaints requested per page. It is the same
                for every page so that consecutive pages neither overlap nor
                leave gaps (the page size previously grew with the page
                number).
            timeout: Per-request timeout in seconds, so a stalled connection
                cannot block a worker thread forever.

        Returns:
            None. Results are appended to the module-level ``big_list``.
            Failures are printed and skipped rather than raised.
        """
        page_no = x + 1
        print(f'第{page_no}页')

        params = {
            'couid': '1878960481',
            'type': '1',
            'page_size': f'{page_size}',
            'page': f'{page_no}',
        }
        url = 'https://tousu.sina.com.cn/api/company/received_complaints'

        try:
            # NOTE(review): verify=False disables TLS certificate checks; kept
            # from the original script, confirm it is really required.
            listing = requests.get(url, params=params, verify=False, timeout=timeout)
            info_list = listing.json()['result']['data']['complaints'] or []
        except Exception as exc:
            # Worker boundary: report the cause and give up on this page only.
            print('错误跳过这一页', repr(exc))
            return

        for dict_info in info_list:
            # Each complaint is isolated so that one malformed detail page
            # does not discard the remaining complaints of this listing page.
            try:
                dict_info['main']['url'] = _absolute_url(dict_info['main']['url'])
                dict_info['author']['avatar'] = _absolute_url(dict_info['author']['avatar'])
                info_url = dict_info['main']['url']
                print(info_url)
                dict_info['投诉详情'] = _scrape_complaint_detail(info_url, timeout)
            except Exception as exc:
                print('错误跳过这条投诉', repr(exc))
                continue
            big_list.append(dict_info)
    
    def run(page):
        """Scrape ``page`` listing pages concurrently and save them as JSON.

        Submits one ``dewu_company`` task per page index to the shared thread
        pool, waits for all of them, then writes the accumulated ``big_list``
        to ``阿里投诉信息.json`` in the current working directory.

        Args:
            page: Number of listing pages to scrape (indices ``0 .. page - 1``).
        """
        futures = [pool.submit(dewu_company, x) for x in range(page)]
        # Still recorded globally for backward compatibility, but only the
        # futures created by this call are awaited below.
        pool_name_list.extend(futures)
        for future in futures:
            future.result()
        print('全部结束开始保存本地')
        with open('阿里投诉信息.json', "w", encoding='utf8') as fw:
            # ensure_ascii=False keeps the Chinese text readable in the UTF-8
            # file instead of escaping every character as \uXXXX.
            json.dump(big_list, fw, ensure_ascii=False)
        print('保存完毕')
    
    if __name__ == "__main__":
        # Script entry point: scrape a single listing page and save it.
        run(page=1)
    
    
  • 相关阅读:
    C#连接手机安装软件和发送信息
    asp.net 简单分页打印
    asp.net 下载的几种方式
    js 刷新后不提示并保留控件状态
    JAVA 基础编程练习题2 【程序 2 输出素数】
    JAVA 基础编程练习题1 【程序 1 不死神兔】
    setMaxActive和setMaxWait方法
    java.lang.UnsupportedClassVersionError: com/mysql/jdbc/Driver : Unsupported major.minor version 52.0
    java.lang.RuntimeException: org.dom4j.DocumentException: 1 字节的 UTF-8 序列的字节 1 无效。
    HTML DOM firstChild lastChild nextSibling previousSibling 属性_获取属性值的undefined问题
  • 原文地址:https://www.cnblogs.com/pythonywy/p/12545614.html
Copyright © 2011-2022 走看看