zoukankan      html  css  js  c++  java
  • 通过 pyppeteer 库获取请求携带的相关参数

    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    #--author: Baozi
    
    import asyncio
    from pyppeteer import launch
    import time
    import re
    
    
    # Module-level state shared between the request listener and get_url().
    # doc_id is initialised here but not used by any code visible in this file.
    doc_id = ""
    # Full URL of the first matching request seen by the listener; empty until one is captured.
    url_params = ""
    async def intercept_response(res):
        """Remember the first observed URL that carries the request parameters.

        The URL is stored in the module-level ``url_params`` (and printed) only
        when nothing has been stored yet and the URL contains both ``'__dyn'``
        and ``'https://www.facebook.com/ajax/bz'``. Later matches are ignored.

        Args:
            res: Any object exposing a ``url`` string attribute.
        """
        global url_params
        seen_url = res.url
        is_target = '__dyn' in seen_url and 'https://www.facebook.com/ajax/bz' in seen_url
        if not is_target or url_params:
            return
        url_params = seen_url
        print(url_params)
    
    
    async def request_check(req):
        """Filter a request: abort heavy resource types, let everything else through.

        Args:
            req: Object exposing ``resourceType`` plus awaitable ``abort()`` and
                ``continue_()`` methods.
        """
        blocked_types = ('image', 'media', 'eventsource', 'websocket')
        if req.resourceType not in blocked_types:
            await req.continue_()
            return
        await req.abort()
    
    async def main(url, proxy, ua):
        """Open ``url`` in a pyppeteer browser so the request listener can capture its URL.

        Launches a non-headless browser behind ``proxy``, registers
        ``intercept_response`` for the page's ``'request'`` events, loads ``url``
        and reloads it up to three more times while the module-level
        ``url_params`` is still empty. It then clicks the first element matched
        by the comment XPath, if any, and always closes the browser.

        Args:
            url: Page address to load.
            proxy: Value passed to Chromium's ``--proxy-server`` flag.
            ua: User-Agent string set on the page before navigation.

        Raises:
            Exception: Whatever pyppeteer raises (for example a navigation
                timeout); the browser is closed before the error propagates.
        """
        browser = await launch({
            'headless': False,
            'args': ['--proxy-server={}'.format(proxy), '--disable-infobars'],
        })  # start pyppeteer (browser driven from this process)
        try:
            # All waits use asyncio.sleep(): time.sleep() would block the event
            # loop, so the 'request' listener could not run while we wait.
            await asyncio.sleep(10)
            page = await browser.newPage()
            page.on('request', intercept_response)

            # Set the User-Agent request header.
            await page.setUserAgent(ua)
            await page.goto(url, {'timeout': 1000 * 20})

            # Retry the navigation until the listener has captured a URL.
            for _ in range(3):
                if url_params:
                    break
                await asyncio.sleep(10)
                await page.goto(url, {'timeout': 1000 * 20})

            comment_click = await page.xpath('//form[@rel="async"]//div[@class="_4vn1"]/span[@class="_4vn2"]/a')
            # The XPath may match nothing; skip the click instead of raising IndexError.
            if comment_click:
                await comment_click[0].click()
                await asyncio.sleep(2.5)
        finally:
            # Never leave a browser process behind, even when a step above fails.
            await browser.close()
    
    def get_url(url, proxy, user_agent):
        """Load ``url`` in a browser and return the parameters of the captured request.

        Runs ``main(url, proxy, user_agent)`` to completion on the asyncio event
        loop, then reads the URL stored in the module-level ``url_params`` and
        extracts the listed query parameters from it verbatim (no
        percent-decoding). A failure inside the browser run is logged rather
        than raised, so a URL that was already captured can still be parsed.

        Args:
            url: Page address handed to ``main``.
            proxy: Proxy server address handed to ``main``.
            user_agent: User-Agent string handed to ``main``.

        Returns:
            dict: ``'__user'`` (always ``'0'``), ``'__a'`` (always ``'1'``) and
            one string entry per parameter extracted from the captured URL.

        Raises:
            IndexError: If no URL was captured or a required parameter is
                missing from it.
        """
        import logging

        try:
            asyncio.get_event_loop().run_until_complete(main(url, proxy, user_agent))
        except Exception:
            # Best effort: keep going, but leave a trace instead of hiding the failure.
            logging.getLogger(__name__).exception('browser run failed for %s', url)

        captured = url_params
        pyputeer_params = {'__user': '0', '__a': '1'}
        wanted = (
            '__dyn', '__csr', '__req', '__beoa', '__pc', 'dpr', '__ccg', '__rev',
            '__s', '__hsi', '__comet_req', 'lsd', 'jazoest',
            '__spin_r', '__spin_b', '__spin_t',
        )
        for key in wanted:
            # Anchor on '?' or '&' so a key can only match a whole parameter name.
            # '__ccg' is read from its own parameter (it used to copy the 'dpr' value).
            found = re.search(r'(?:^|[?&])' + re.escape(key) + r'=([^&]*)', captured)
            if found is None:
                raise IndexError(
                    'parameter {!r} not found in captured request URL {!r}'.format(key, captured)
                )
            pyputeer_params[key] = found.group(1)

        return pyputeer_params
    
    if __name__ == '__main__':
        # Manual run: fetch the request parameters for one sample post and print them.
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3965.0 Safari/537.36'
        proxy = 'http://172.16.7.14:13512'
        url = 'https://www.facebook.com/news.hkcd/posts/2966706433454938'
        captured = get_url(url, proxy, user_agent)
        print(captured)
    做一枚奔跑的老少年!
  • 相关阅读:
    MYSQL性能优化的最佳20+条经验
    MySQL性能分析工具之PROFILE
    理解事务的4种隔离级别
    二进制中1的个数
    滑动窗口最大值
    字符流中第一个不重复字符
    字符串转化为整数
    java字符,字符串,数字之间的转换
    java中数组输出的方式
    java基础知识(1)
  • 原文地址:https://www.cnblogs.com/xiaoshayu520ly/p/13543210.html
Copyright © 2011-2022 走看看