zoukankan      html  css  js  c++  java
  • 使用mitmproxy做今日头条爬虫链接分析

    import pickle
    
    import chardet
    from mitmproxy import ctx
    from pprint import pprint
    
    heads_file = 'header.txt'
    
    body_file = 'body.txt'
    
    #mitmdump -s test.py
    # Dalvik/2.1.0 (Linux; U; Android 8.1.0; MI 8 MIUI/8.8.31)
    def request(flow):
         """mitmproxy request hook: log every intercepted request URL and
         append it to ``other.txt``, one URL per line.

         Run via:  mitmdump -s test.py

         Args:
             flow: the mitmproxy HTTPFlow for the intercepted request.
         """
         # Optionally spoof the client User-Agent here, e.g.:
         # flow.request.headers['User-Agent'] = 'Mozilla/5.0 (Linux; U; Android 6.0.1; ...) MiuiBrowser/8.7.1'
         url = str(flow.request.url)
         ctx.log.info("url:" + url)
         # 'with' guarantees the file handle is closed even if write() raises;
         # "\n" terminates each URL on its own line (the original source had the
         # escape sequence mangled into a literal line break, a SyntaxError).
         with open("other.txt", encoding="utf-8", mode="a") as out:
              out.write(url + "\n")
         # To capture only article/search endpoints instead, filter like:
         # if 'pstatp.com/article' in url or 'snssdk.com/article' in url or 'snssdk.com/api/search' in url:
         #      append url to heads_file as above
    
    
    # def response(flow):
    #      response = flow.response
    #      info = ctx.log.info
    #      info(str(response.status_code))
    #      info(str(response.headers))
    #      info(str(response.cookies))
    #      # info(str(response.encoding))
     #      detRes = chardet.detect(response.content)  # detect the response body's character encoding
    #      charset = detRes["encoding"]
    #      info(str(charset))
    #      # text = response.content.decode(charset, "ignore")
    #      if not charset:
    #           charset = 'utf-8'
    #      text = str(response.content,encoding=charset)
    #      info(text)
    #      file = open(body_file,encoding=charset,mode="a")
    #      file.write(text)
    #      file.close()
         # with open(body_file, 'a') as handle:
         #      pickle.dump(text, handle)
  • 相关阅读:
    网站搜索功能lucene
    RabbitMQ消息队列
    zookeeper
    RPC+SOA+dubbo
    石英定时任务-quartz
    通用mapper、图片上传、nginx
    通用mapper和分类实现
    后台商品管理功能实现
    构建框架
    海量数据的并发处理
  • 原文地址:https://www.cnblogs.com/procedureMonkey/p/10320322.html
Copyright © 2011-2022 走看看