zoukankan      html  css  js  c++  java
  • Spider_基础总结5--动态网页抓取--元素审查--json--字典

    # 静态网页在浏览器中展示的内容都在HTML的源码中,但主流网页使用 Javascript时,很多内容不出现在HTML的源代码中,此时仍然使用
    # requests+beautifulsoup是不能够成功的,如:
    
    # 动态网页的爬取,使用 requests+beautifulsoup是不会成功的:
    # import requests
    # from bs4 import BeautifulSoup
    # url = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery112406954584941688864_1592120544800&limit=10&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592120544802'
    # headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'}
    # html = requests.get(url, headers= headers)
    # bs=BeautifulSoup(html.text,'html.parser')
    # comments_tags=bs.find_all('div',{'class':'reply-content-wrapper'})
    # for comment in comments_tags:
    #     print(comment.attrs['data-content'])
    
    
    # Ajax: Asynchronous JavaScript And XML,异步 JavaScript 和 XML; 在不重新加载整个网页的情况下对网页的某部分进行更新,节省流量,速度快。
    # 加大了 爬虫的难度。为解决这个问题,可以采用两种技术: 1)通过浏览器审查元素解析真实网页的地址。2)使用 Selenium模拟浏览器的方法。
    
    # 本节内容:通过浏览器审查元素解析真实网页的地址:
    
    # 真实网址: 
    # 第一页: https://api-zero.livere.com/v1/comments/list?callback=jQuery112406954584941688864_1592120544800&limit=10&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592120544802
    # 第二页: https://api-zero.livere.com/v1/comments/list?callback=jQuery112408983696804040213_1592128123614&limit=10&offset=2&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592128123621
    # 重新刷新第二页: https://api-zero.livere.com/v1/comments/list?callback=jQuery1124042695935490813275_1592128347126&limit=10&offset=2&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592128347133
    
    # 第一页和第二页最明显的区别在于:
    # offset (虽然有其他地方也不一样,但不影响,只有 offset起决定作用),所以可以通过控制 offset来翻页。
    
    # 请求头: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362
    
    # 根据上面信息,我们将代码设计为:
    import requests
    # Real comment-API address of the first page, located with the browser's
    # element/network inspector.
    url = (
        'https://api-zero.livere.com/v1/comments/list'
        '?callback=jQuery112406954584941688864_1592120544800'
        '&limit=10&repSeq=4547710'
        '&requestPath=%2Fv1%2Fcomments%2Flist'
        '&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154'
        '&code=1afecb1fc5912d454d80ffc6&_=1592120544802'
    )

    # Browser User-Agent sent with the request.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'
        ),
    }
    r = requests.get(url, headers=headers)
    # Show the raw response body exactly as the server returned it.
    print(r.text)
    
    /**/ typeof jQuery112406954584941688864_1592120544800 === 'function' && jQuery112406954584941688864_1592120544800({"results":{"parents":[{"replySeq":42003685,"name":"奔跑的苹果树","memberId":"oBVoaxMyiTIYdTYmbPxXxNVrAxz4","memberIcon":"http://thirdwx.qlogo.cn/mmopen/vi_32/2CBNK5cDVstrL3W33VXJSCic8Pu3jczS4UNQtf04ZhdpVtk1PlRc8slz1lzJCakwKeFLtdGO0cqj9dDBosicWq6w/132","memberUrl":"http://www.wechat.com","memberDomain":"wechat","good":0,"bad":0,"police":0,"parentSeq":42003685,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"112.102.211.149","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-06-14T07:35:53.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"真实地址怎么获取?点击右键检查了也没发现啊。","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32374754,"memberSeq":32926179,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41888279,"name":"Creep","memberId":"oBVoaxAxqLr16sfwz1GXm9UaHVF4","memberIcon":"http://thirdwx.qlogo.cn/mmopen/vi_32/62cLVFreHtJN80DNyHnEGqrC9v42QWErXr20KB2icDCSQuNAPuYibpO7yAYTb5FY90MSpl1gLIabf7KktQibia4nNA/132","memberUrl":"http://www.wechat.com","memberDomain":"wechat","good":0,"bad":0,"police":0,"parentSeq":41888279,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + 
selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"58.62.87.37","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.159 Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-06-01T12:20:08.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"学习中","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32349986,"memberSeq":32901188,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41882866,"name":"余非鱼^*^","memberId":"oBVoaxHwTIri5lNP36JXwSK2NMzg","memberIcon":"http://thirdwx.qlogo.cn/mmopen/vi_32/Q0j4TwGTfTIl3ibbP9gC9ES0zN5LIhvfzPB4zICW123JG2PawaXS9c0oiaoFDQp4RJrupZf8AolXZQH3tNI2QwWA/132","memberUrl":"http://www.wechat.com","memberDomain":"wechat","good":0,"bad":0,"police":0,"parentSeq":41882866,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"171.34.101.38","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-06-01T02:35:00.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"一起学习","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32348903,"memberSeq":32900097,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41458240,"name":"無","memberId":"UID_43B3E8679B3B9880BEB734882BCE59B3","memberIcon":"http://thirdqq.qlogo.cn/g?b=oidb&k=zuYsrwicH5EvoOeKJibGVaaQ&s=100&t=1584881994","memberUrl":"https://qq.com/","memberDomain":"qq","good":0,"bad":0,"police":0,"parentSeq":41458240,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"117.166.113.250","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 10.0; ) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-04-22T04:29:49.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"一句话,给我爬!!!!","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32277925,"memberSeq":32828481,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085166,"name":"astin2020","memberId":"xiangxuexi2018@163.com","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085166,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:13:25.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"为什么不多放几个回帖","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085164,"name":"astin2020","memberId":"xiangxuexi2018@163.com","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085164,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:13:01.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"哎,还要多少啊。","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085162,"name":"astin2020","memberId":"xiangxuexi2018@163.com","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085162,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:12:40.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"我不知道要多少帖子才能翻篇啊,你们没有买他的书吗","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085159,"name":"astin2020","memberId":"xiangxuexi2018@163.com","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085159,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:11:49.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"我要疯了。作者拜托你能不能改一下啊","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085152,"name":"astin2020","memberId":"xiangxuexi2018@163.com","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085152,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 
Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:11:22.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"一页到底能装多少回帖啊?","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null},{"replySeq":41085150,"name":"astin2020","memberId":"xiangxuexi2018@163.com","memberIcon":"https://cdn-city.livere.com/images/user_profile_4","memberUrl":"https://livere.com","memberDomain":"livere","good":0,"bad":0,"police":0,"parentSeq":41085150,"directSeq":0,"shortUrl":null,"title":"第四章- 动态网页抓取 (解析真实地址 + selenium)","site":"http://www.santostang.com/2018/07/14/%E7%AC%AC%E5%9B%9B%E7%AB%A0%EF%BC%9A%E5%8A%A8%E6%80%81%E7%BD%91%E9%A1%B5%E6%8A%93%E5%8F%96-%E8%A7%A3%E6%9E%90%E7%9C%9F%E5%AE%9E%E5%9C%B0%E5%9D%80-selenium/","email":null,"ipAddress":"125.67.134.151","isMobile":"0","agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36","septSns":null,"targetService":null,"targetUserName":null,"info1":null,"info2":null,"info3":null,"image1":null,"image2":null,"image3":null,"link1":null,"link2":null,"link3":null,"isSecret":0,"isModified":0,"confirm":0,"subCount":0,"regdate":"2020-03-22T17:10:59.000Z","deletedDate":null,"file1":null,"file2":null,"file3":null,"additionalSeq":0,"content":"好累啊","quotationSeq":null,"quotationContent":null,"consumerSeq":1020,"livereSeq":28583,"repSeq":4547710,"memberGroupSeq":32204920,"memberSeq":32754725,"status":0,"repGroupSeq":0,"adminSeq":25413747,"deleteReason":null,"sticker":0,"version":null}],"children":[],"quotations":[]},"resultCode":200,"resultMessage":"Okay, 
livere"});
    


    # 只获取第一页评论:
    # 解析得到的字符串r.text(即 json字符串)可以使用json库来完成解析:
    import json
    import requests
    # Real comment-API address of the first page.
    url = (
        'https://api-zero.livere.com/v1/comments/list'
        '?callback=jQuery112406954584941688864_1592120544800'
        '&limit=10&repSeq=4547710'
        '&requestPath=%2Fv1%2Fcomments%2Flist'
        '&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154'
        '&code=1afecb1fc5912d454d80ffc6&_=1592120544802'
    )

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'}
    r = requests.get(url, headers=headers)
    # The body is JSONP: the JSON object is wrapped in a callback call that ends
    # with ");".  Keep only the text from the first '{' through the last '}' so
    # the wrapper is removed even if extra characters (e.g. a newline) follow
    # the ");" -- a fixed [:-2] slice would leave them in and fail to parse.
    payload = r.text
    json_data_dict = json.loads(payload[payload.find('{'):payload.rfind('}') + 1])
    # json_data_dict is a dict nested in a dict: the outer 'results' key maps to
    # a dict whose 'parents' key maps to a list, and each list item is a dict
    # describing one comment.
    comments_list = json_data_dict['results']['parents']
    for comment_dict in comments_list:
        print(comment_dict['content'])
    
    # 或 :
    import json
    import requests
    import jsonpath
    # Real comment-API address of the first page.
    url = (
        'https://api-zero.livere.com/v1/comments/list'
        '?callback=jQuery112406954584941688864_1592120544800'
        '&limit=10&repSeq=4547710'
        '&requestPath=%2Fv1%2Fcomments%2Flist'
        '&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154'
        '&code=1afecb1fc5912d454d80ffc6&_=1592120544802'
    )

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'}
    r = requests.get(url, headers=headers)
    # Strip the JSONP callback wrapper by keeping the text from the first '{'
    # through the last '}'; unlike a fixed [:-2] slice this still works when
    # extra characters follow the closing ");".
    payload = r.text
    json_data_dict = json.loads(payload[payload.find('{'):payload.rfind('}') + 1])
    # jsonpath.jsonpath() returns False (not an empty list) when nothing
    # matches, which would make the loop below raise TypeError; fall back to [].
    comments_list = jsonpath.jsonpath(json_data_dict, '$.results.parents[*].content') or []
    for comment in comments_list:
        print(comment)
    
    
        
    # 真实地址怎么获取?点击右键检查了也没发现啊。
    # 学习中
    # 一起学习
    # 一句话,给我爬!!!!
    # 为什么不多放几个回帖
    # 哎,还要多少啊。
    # 我不知道要多少帖子才能翻篇啊,你们没有买他的书吗
    # 我要疯了。作者拜托你能不能改一下啊
    # 一页到底能装多少回帖啊?
    # 好累啊
    
    # 获取两页评论:
    import json
    import requests
    
    
    def get_comments(page_num):
        """Fetch one page of comments and append them to the global ``comments_list``.

        The request URL is built by inserting ``page_num`` as the value of the
        ``offset`` query parameter.  The response is JSONP, so the callback
        wrapper is stripped before the JSON is decoded.

        Args:
            page_num: Value for the ``offset`` query parameter.  A string as
                before, or anything ``str()`` can render (e.g. an ``int``).

        Returns:
            None.  The dicts under ``['results']['parents']`` of the decoded
            response are appended to the module-level ``comments_list``, which
            must already exist when this function is called.
        """
        global comments_list
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'}

        # The parentheses make the three pieces a single expression; without
        # them (or explicit backslashes) the lines starting with "+" are a
        # syntax error.
        url = (
            'https://api-zero.livere.com/v1/comments/list?callback=jQuery1124042695935490813275_1592128347126&limit=10&offset='
            + str(page_num)
            + '&repSeq=4547710&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=1afecb1fc5912d454d80ffc6&_=1592128347133'
        )
        r = requests.get(url, headers=headers)
        # Keep only the text from the first '{' through the last '}' so the
        # JSONP wrapper (including the trailing ");") is removed even when
        # extra characters follow it.
        payload = r.text
        json_data_dict = json.loads(payload[payload.find('{'):payload.rfind('}') + 1])
        # json_data_dict is a dict nested in a dict: 'results' maps to a dict
        # whose 'parents' key maps to a list of per-comment dicts.
        comments_list.extend(json_data_dict['results']['parents'])
        
    
    if __name__ == '__main__':
        # Shared accumulator; get_comments() extends it via its ``global`` statement.
        comments_list = []
        first_page, last_page = 1, 2
        for page_num in range(first_page, last_page + 1):
            # get_comments() is handed the page number as a string.
            get_comments(str(page_num))
        # Print the 'content' field of every collected comment, one per line.
        for content in (entry['content'] for entry in comments_list):
            print(content)
    
            
            
    # 真实地址怎么获取?点击右键检查了也没发现啊。
    # 学习中
    # 一起学习
    # 一句话,给我爬!!!!
    # 为什么不多放几个回帖
    # 哎,还要多少啊。
    # 我不知道要多少帖子才能翻篇啊,你们没有买他的书吗
    # 我要疯了。作者拜托你能不能改一下啊
    # 一页到底能装多少回帖啊?
    # 好累啊
    # 还不够哦
    # 如果这样违反了你的规定,请原谅,我也是没有办法,只能帮你把水灌上
    # 不然好多代码我没有办法去按照你书上的内容操作。很郁闷
    # 主人可能忘记爬虫的跟帖必须要翻过两页才能测试啊
    # 是不是要10页才翻篇
    # 我要追加多少评论才够两页呢
    # 为什么我能看到评论呢??
    # 学习
    # 不是
    # 我是第一个来的吗?       
    
    # 回顾:
    
    # 1)--代码在 IDE里的换行:
    # A statement may be wrapped in the editor with a trailing backslash; the
    # adjacent string literals are joined, so the value contains no newline.
    a = 'aaaaaaaaaaaaaaaaaaaaabbbbbbccc' \
        'ggggg'
    print(a)  # aaaaaaaaaaaaaaaaaaaaabbbbbbcccggggg
    # The same works around an operator: each physical line ends with a backslash.
    b = 'aaaaaaaaaaaaaaaaaaaaabbbbbbccc' \
        + \
        'ggggg'
    print(b)  # aaaaaaaaaaaaaaaaaaaaabbbbbbcccggggg

    # 2) Line break in the output: here the newline character is part of the
    #    string itself.
    c = 'aaaaaaaaaaaaaaaaaaaaabbbbbbccc\nggggg'
    print(c)
    # aaaaaaaaaaaaaaaaaaaaabbbbbbccc
    # ggggg


    # A backslash can also wrap a statement header such as ``if``.
    i = True
    if \
            i == True:
        print('haha')
    
  • 相关阅读:
    七层协议&网络配置
    解决跨域问题
    拖拽 ‘vue-grid-layout’ 插件了解下
    详解vuex
    在腾讯出差的日子
    对象的解构赋值应用
    MQTT项目请求设置
    五分钟搞定Go.js
    Chrome使用video无法正常播放MP4视频的解决方案
    微信小程序地图开发总结
  • 原文地址:https://www.cnblogs.com/Collin-pxy/p/13207113.html
Copyright © 2011-2022 走看看