zoukankan      html  css  js  c++  java
  • 爬取微信文章

    1.抓包

      打开微信网页版

      

        

        抓包:

        

        

        通过分析,我们知道,每次请求文章时,只是偏移量offset不一样而已。
        还有要注意的是,每个公众号对应的cookies是不一样的,这个也是要小心的

        根据接口数据构造请求,便能获取公众号文章了!

    2.构造请求,获取数据

      

    import requests
    import json
    import time
    
    
    def parse(__biz, uin, key, pass_ticket, appmsg_token="", offset="0"):
        """
        Fetch an official account's article list and append it to ``article.csv``.

        Requests the ``profile_ext`` endpoint with ``action=getmsg`` page by
        page, starting at ``offset`` and following the ``next_offset`` value of
        each response for as long as the response reports
        ``can_msg_continue == 1``. For every message that carries article info,
        one row ``title,digest,url,datetime`` is printed and appended to
        ``article.csv``.

        :param __biz: value sent as the ``__biz`` query parameter
        :param uin: value sent as the ``uin`` query parameter
        :param key: value sent as the ``key`` query parameter
        :param pass_ticket: value sent as the ``pass_ticket`` query parameter
        :param appmsg_token: value sent as the ``appmsg_token`` query parameter
        :param offset: offset of the first page to request
        :return: None
        """
        import csv

        url = 'https://mp.weixin.qq.com/mp/profile_ext'
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.901.400 QQBrowser/9.0.2524.400",
        }

        # Iterate over pages instead of recursing once per page, so a long
        # history cannot exhaust the interpreter's recursion limit.
        while True:
            params = {
                "action": "getmsg",
                "__biz": __biz,
                "f": "json",
                "offset": str(offset),
                "count": "10",
                "is_ok": "1",
                "scene": "124",
                "uin": uin,
                "key": key,
                "pass_ticket": pass_ticket,
                "wxtoken": "",
                "appmsg_token": appmsg_token,
                "x5": "0",
            }

            res = requests.get(url, headers=headers, params=params, timeout=3)
            data = json.loads(res.text)
            print(data)

            # Without a message list there is nothing to parse; stop here
            # rather than crash on a missing field.
            raw_list = data.get("general_msg_list")
            if not raw_list:
                print("获取文章列表失败:", data.get("errmsg", ""))
                return

            # The message list is itself a JSON document embedded as a string.
            # Decode it with the JSON parser; never eval() remote content.
            msg_list = json.loads(raw_list).get("list", [])

            # Open the file once per page, with an explicit encoding so that
            # Chinese titles do not depend on the platform default codec.
            with open('article.csv', 'a', encoding='utf-8', newline='') as f:
                writer = csv.writer(f, lineterminator='\n')
                for msg in msg_list:
                    # Messages without article info (plain text, images, ...)
                    # lack these fields and are skipped.
                    try:
                        info = msg["app_msg_ext_info"]
                        # Commas are stripped to keep the row layout identical
                        # to files written by earlier versions of this script.
                        title = info["title"].replace(',', '')
                        digest = info["digest"].replace(',', '')
                        link = info["content_url"]
                        date = msg["comm_msg_info"]["datetime"]
                    except (KeyError, TypeError, AttributeError):
                        continue

                    # Upgrade only a leading "http://"; a blanket
                    # replace("http", "https") would turn "https" into "httpss".
                    if link.startswith("http://"):
                        link = "https://" + link[len("http://"):]

                    print(title, digest, link, date)
                    writer.writerow([title, digest, link, date])

            # can_msg_continue: 1 - more pages available, 0 - reached the end
            if 1 != data.get("can_msg_continue", 0):
                print("爬取完毕")
                return

            time.sleep(3)
            offset = data["next_offset"]
    
    if __name__ == '__main__':
        # Ask for the four request parameters, in the order parse() expects them.
        prompts = ('biz: ', 'uin: ', 'key: ', 'passtick: ')
        credentials = [input(prompt) for prompt in prompts]
        # Start crawling from the first page.
        parse(*credentials, appmsg_token="", offset="0")

      数据:

      

    3.另外一个版本

    import requests
    
    import time
    import json
    import os
    import pdfkit
    
    
    class mp_spider(object):
        """
        Crawl an official account's article list and save each article as a PDF.

        ``request_data`` fetches pages from ``base_url`` starting at
        ``self.offset``; ``parse_data`` handles one page and advances the
        offset; ``creat_pdf_file`` renders a single article through pdfkit.
        """

        # Characters that cannot appear in a Windows file name.
        _INVALID_FILENAME_CHARS = '\\/:*?"<>|'

        def __init__(self):
            self.config = pdfkit.configuration(wkhtmltopdf='C:/Program Files/wkhtmltopdf/bin/wkhtmltopdf.exe')
            self.offset = 0
            self.count = 0
            self.base_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzAwMjQwODIwNg==&f=json&offset={}&count=10&is_ok=1&scene=124&uin=MTIyOTkzMzgyMA%3D%3D&key=7cabb994f4d85a88ad37c1ec41ddde6234e76a1f1e69b178052bc99ccdf724f77700b28cea9e242cc98e517bd2537122fdc7a65a601e36f438b33e31e183f64dd9519beed36d892cc0a31855f1c649d6&pass_ticket=n6xnvQjzn4yfkjScc%2FSoVi4SkEgzf4z0airW6Ue14zIDNH98t%2Fr62k2KszUJ1qNv&wxtoken=&appmsg_token=960_mNI0W0CuVRuEpG7GsxB7f7pUUrO2CWW_iib4ww~~&x5=0&f=json'
            self.headers = {
                'Host': 'mp.weixin.qq.com',
                'Connection': 'keep-alive',
                'Accept': '*/*',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.884.400 QQBrowser/9.0.2524.400',
                'X-Requested-With': 'XMLHttpRequest',
                'Referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5MTQ4NjA3Nw==&scene=124&uin=MjA2MDM3NTU%3D&key=2b903b9a7252346947b8c8bec6a8e97ea469a66c7c55196aec680d36fef8d99bdd51ba33c76a8d0e5655e5186714a09c18bdc873bdac2350ffd215c1d3cb331a3f67f0dcc00984035cbaacc19e1ef3e2&devicetype=Windows+10&version=62060344&lang=zh_CN&a8scene=7&pass_ticket=jAFRJRtWRdJcSXta5fiYsjBqfK6vqTIYWrULumuK5sc%3D&winzoom=1',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4',
                'Cookie': 'wxuin=1229933820; devicetype=Windows10; version=6206021f; lang=zh_CN; pass_ticket=n6xnvQjzn4yfkjScc/SoVi4SkEgzf4z0airW6Ue14zIDNH98t/r62k2KszUJ1qNv; wap_sid2=CPyZvcoEElwzdm5YaDByenY3S2dzYlJtdXFDQVJYbmZKUERuM2I5elhMb3NxMVZqX3FCTDVYaFJ2Rkd2RktMdm9KajV3TWU5T3YyTTVfUG5zZ2llWko0cW5aMzBiY0FEQUFBfjCo9fLYBTgNQJVO'
            }

        @staticmethod
        def _safe_filename(title):
            """Return ``title`` with characters illegal in file names replaced by ``_``."""
            illegal = mp_spider._INVALID_FILENAME_CHARS
            return ''.join('_' if (ch in illegal or ord(ch) < 32) else ch for ch in title)

        def request_data(self):
            """
            Fetch pages starting at ``self.offset`` and hand each to ``parse_data``.

            Returns when a response has a non-200 status. Otherwise the loop
            ends when ``parse_data`` raises ``SystemExit`` (crawl finished or
            the server reported an error).
            """
            # Loop instead of request_data/parse_data calling each other
            # recursively, which grew the call stack by two frames per page.
            while True:
                response = requests.get(self.base_url.format(self.offset), headers=self.headers, timeout=10)
                if 200 != response.status_code:
                    return
                self.parse_data(response.text)

        def parse_data(self, response_data):
            """
            Process one page of results and advance ``self.offset``.

            :param response_data: JSON text of one ``getmsg`` response
            :raises SystemExit: when the response reports an error (``ret`` is
                not 0) or when the last page has been processed
            """
            all_datas = json.loads(response_data)

            if 0 != all_datas['ret']:
                raise SystemExit('数据抓取出错:' + str(all_datas.get('errmsg', '')))

            # The article list is a JSON document embedded as a string.
            datas = json.loads(all_datas['general_msg_list'])['list']
            for data in datas:
                # Entries without article info lack these keys; skip them.
                try:
                    info = data['app_msg_ext_info']
                    title = info['title']
                    title_child = info['digest']
                    article_url = info['content_url']
                    cover = info['cover']
                    copyright_stat = info['copyright_stat']
                except (KeyError, TypeError):
                    continue

                copyright_tag = '原创文章_' if copyright_stat == 11 else '非原创文章_'
                self.count = self.count + 1
                print('第【{}】篇文章'.format(self.count), copyright_tag, title, title_child, article_url, cover)
                self.creat_pdf_file(article_url, '{}_{}'.format(copyright_tag, title))

            # Check for more pages only AFTER processing this one, so the
            # articles on the final page are not thrown away.
            if 1 != all_datas['can_msg_continue']:
                raise SystemExit('数据抓取完毕!')

            time.sleep(3)
            self.offset = all_datas['next_offset']  # offset of the next page

        def creat_pdf_file(self, url, title):
            """
            Render the article at ``url`` to ``D:/store/file2/<title>.pdf``.

            Existing files are left untouched. Failures are printed and
            swallowed so that one bad article does not abort the crawl.
            """
            try:
                file = 'D:/store/file2/{}.pdf'.format(self._safe_filename(title))
                if not os.path.exists(file):  # skip articles already saved
                    # Pass the wkhtmltopdf location configured in __init__;
                    # without it pdfkit falls back to searching PATH.
                    pdfkit.from_url(url, file, configuration=self.config)

            except Exception as e:
                print(e)
    
    
    if __name__ == "__main__":
        # Script entry point: build the spider and start crawling at once.
        mp_spider().request_data()

    2.手机版

      

      把url和header都copy过来

    # Request URL copied from the captured mobile-client traffic.
    url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzI5MzQxMDc4NQ==&f=json&offset=10&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket=YJ6cxPlh48WznjIK5i8RmTISv5IBx6jmNi31U6yUX4zzDc%2B1Z96CXE1fgDusy%2BQe&wxtoken=&appmsg_token=1007_KF456mIhy7Z%252Bq0p5c8hIvqc37qg6tuqlZ0NWtg~~&x5=0&f=json'

    # All of the captured request headers must be sent along with the request.
    headers = {
        'Host': 'mp.weixin.qq.com',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G9350 Build/LMY48Z) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36 MicroMessenger/6.6.7.1321(0x26060736) NetType/WIFI Language/zh_CN',
        'Accept-Language': 'zh-CN,en-US;q=0.8',
        'Cookie': 'rewardsn=; wxtokenkey=777; wxuin=3367222304; devicetype=android-22; version=26060736; lang=zh_CN; pass_ticket=YJ6cxPlh48WznjIK5i8RmTISv5IBx6jmNi31U6yUX4zzDc+1Z96CXE1fgDusy+Qe; wap_sid2=CKD4zsUMElxJQVdKa2lUUnVZS0VhbnY2WGdhTEhfbFdMT3h5NndGM1Bjd2RpQkJUYnBUVmdVMVpmS3BOYkpQMENzS21fbXNSV3BWa2s1VV9LSkdmT2dZbEp0ZTZpdThEQUFBfjCsq8nmBTgNQJVO',
    }

      

      这样就拿到了数据:

      

    import requests
    
    # History home page of the account. The fragments are wrapped in
    # parentheses so that the adjacent string literals are joined into one URL.
    url = (
        'https://mp.weixin.qq.com/mp/profile_ext'
        '?action=home'
        '&__biz=MzA5MTAxMjEyMQ=='
        '&scene=126'
        '&bizpsid=0'
        '&devicetype=android-23'
        '&version=2607033c'
        '&lang=zh_CN'
        '&nettype=WIFI'
        '&a8scene=3'
        '&pass_ticket=LvcLsR1hhcMXdxkZjCN49DcQiOsCdoeZdyaQP3m5rwXkXVN7Os2r9sekOOQULUpL'
        '&wx_header=1'
    )


    # Raw request headers as captured, one "Name: value" pair per line; this
    # text is turned into a dict by headers_to_dict() before the request is sent.
    # NOTE(review): the Accept value ended in ",/;q=0.8" in the pasted text,
    # which is not a valid media range; restored to "*/*;q=0.8" on the
    # assumption that the asterisks were lost in formatting - confirm against
    # a fresh capture.
    headers ='''
    Host: mp.weixin.qq.com
    
    Connection: keep-alive
    
    User-Agent: Mozilla/5.0 (Linux; Android 6.0.1; OPPO R9s Build/MMB29M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 TBS/044405 Mobile Safari/537.36 MMWEBID/5576 MicroMessenger/6.7.3.1360(0x2607033C) NetType/WIFI Language/zh_CN Process/toolsmp
    
    x-wechat-key: d2bc6fe213fd0db717e11807caca969ba1d7537e57fc89f64500a774dba05a4f1a83ae58a3d039efc6403b3fa70ebafb52cfd737b350b58d0dca366b5daf92027aaefcb094932df5a18c8764e98703dc
    
    x-wechat-uin: MTA1MzA1Nzk4Mw%3D%3D
    
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,image/wxpic,image/sharpp,image/apng,image/tpg,*/*;q=0.8
    
    Accept-Encoding: gzip, deflate
    
    Accept-Language: zh-CN,en-US;q=0.8
    
    Q-UA2: QV=3&PL=ADR&PR=WX&PP=com.tencent.mm&PPVN=6.7.3&TBSVC=43620&CO=BK&COVC=044405&PB=GE&VE=GA&DE=PHONE&CHID=0&LCID=9422&MO= OPPOR9s &RL=1080*1920&OS=6.0.1&API=23
    
    Q-GUID: edb298c301f35e6c59298f2313b788cb
    
    Q-Auth: 31045b957cf33acf31e40be2f3e71c5217597676a9729f1b
    '''
    
    
    def headers_to_dict(headers):
        """
        Convert raw header text into a dict.

        The string
        '''
        Host: mp.weixin.qq.com
        Connection: keep-alive
        Cache-Control: max-age=
        '''
        becomes
        {
            "Host": "mp.weixin.qq.com",
            "Connection": "keep-alive",
            "Cache-Control":"max-age="
        }

        Blank or whitespace-only lines and lines without a ``:`` separator are
        ignored. Each line is split on its first ``:`` only, so values that
        contain colons are kept whole. Names and values are stripped of
        surrounding whitespace.

        :param headers: str
        :return: dict
        """
        d_headers = dict()
        for line in headers.splitlines():
            name, sep, value = line.partition(":")
            name = name.strip()
            # Skip blank lines and anything that is not a "Name: value" pair.
            if not sep or not name:
                continue
            d_headers[name] = value.strip()
        return d_headers
    
    
    # with open("weixin_history.html", "w", encoding="utf-8") as f:
    #     f.write(response.text)
    
    
    def extract_data(html_content):
        """
        Extract the article history embedded in a history page.

        Looks for the ``msgList = '{...}'`` assignment in the page source,
        undoes the HTML entity escaping of its value and decodes it as JSON.

        :param html_content: page source
        :return: value of the decoded object's ``list`` entry; an empty list
            when the page has no ``msgList`` assignment or no ``list`` entry
        """
        import re
        import html
        import json

        rex = "msgList = '({.*?})'"  # regular expression for the embedded JSON
        pattern = re.compile(pattern=rex, flags=re.S)
        match = pattern.search(html_content)
        if not match:
            # Return an empty list rather than None so callers can always
            # iterate over the result.
            return []

        data = html.unescape(match.group(1))  # undo HTML entity escaping
        articles = json.loads(data).get("list")
        return articles or []
    
    
    def crawl():
        """
        Fetch the history page and print the article info of each entry.

        :raises Exception: when the response is the verification page, which
            is treated as a sign that the request parameters are wrong or stale
        :return: None
        """
        # NOTE(review): verify=False turns off TLS certificate verification and
        # exposes the session headers to interception. It is kept because the
        # original request used it; drop it unless it is really needed.
        # A timeout is set so that a stalled connection cannot hang the script.
        response = requests.get(url, headers=headers_to_dict(headers), verify=False, timeout=10)
        print(response.text)
        if '<title>验证</title>' in response.text:
            raise Exception("获取微信公众号文章失败,可能是因为你的请求参数有误,请重新获取")
        data = extract_data(response.text)
        # extract_data may yield nothing when the page has no article list.
        for item in data or []:
            # Not every entry carries article info; skip those that do not.
            info = item.get('app_msg_ext_info')
            if info is not None:
                print(info)
    
    if __name__ == "__main__":
        # Run the crawl only when this file is executed as a script.
        crawl()

    关于微信接口可参考:

      https://blog.csdn.net/wangjiakang12306/article/details/88862462

  • 相关阅读:
    函数
    数组
    类的例题
    异常语句
    类的学习
    for的穷举、迭代
    for循环
    switch case
    反相器,扇入扇出
    T触发器,JK触发器的verilog实现
  • 原文地址:https://www.cnblogs.com/tjp40922/p/10805773.html
Copyright © 2011-2022 走看看