zoukankan      html  css  js  c++  java
  • b站评论区爬取

    import requests
    
    import time
    
    from bs4 import BeautifulSoup
    
    import json
    
    # 必要的库
    
    
    
    def get_html(url):
        """Fetch ``url`` with browser-like headers and return the body as text.

        Args:
            url: Address to request.

        Returns:
            The response body, decoded as UTF-8.

        Raises:
            requests.HTTPError: If the server answers with an error status
                (raised by ``raise_for_status``). Other ``requests`` errors,
                such as a timeout after 30 seconds, propagate unchanged.
        """
        # Headers that make the request look like it comes from a browser.
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) appleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        }

        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()

        # Force UTF-8 decoding of ``r.text`` instead of relying on the charset
        # that requests infers from the response headers.
        r.encoding = 'utf-8'

        return r.text
    
    
    
    def get_content(url):
        """Download one page of replies and extract the fields of interest.

        Args:
            url: Address whose body is JSON with the replies stored under
                ``data -> replies``.

        Returns:
            A list with one dict per reply, keyed by '用户名' (user name),
            'uid号' (user id), '评论内容' (comment text) and '性别' (sex).

        Raises:
            json.JSONDecodeError: If the body is not valid JSON. The message
                "jsonload error" is printed before the error is re-raised.
            TypeError, KeyError: If the payload does not have the expected
                structure, e.g. ``replies`` is null. These are deliberately
                not handled here so that the caller can treat them as the
                signal to stop paging.
        """
        html = get_html(url)

        try:
            s = json.loads(html)
        except json.JSONDecodeError:
            # Keep the console message, then surface the real parse error
            # rather than failing later on an unbound variable.
            print("jsonload error")
            raise

        comments = []
        for comment in s['data']['replies']:
            member = comment['member']
            comments.append({
                '用户名': member['uname'],
                'uid号': member['mid'],
                '评论内容': comment['content']['message'],
                '性别': member['sex'],
            })
        return comments
    
    def Out2File(dict):
        """Append comment records to the text file '评论区爬取.txt'.

        Args:
            dict: Iterable of records, each a mapping with the keys '用户名',
                'uid号', '性别' and '评论内容'. (The parameter name shadows the
                builtin; it is kept so existing callers keep working.)

        A record that cannot be formatted or written is reported with
        "out2File error" and skipped; the remaining records are still saved.
        Interrupts such as Ctrl-C are not swallowed and propagate to the caller.
        """
        with open('评论区爬取.txt', 'a+', encoding='utf-8') as f:
            for user in dict:
                try:
                    f.write('姓名:{}\t uid:{}\t 性别:{}\t \n 评论内容:{}\t \n'.format(user['用户名'], user['uid号'], user['性别'], user['评论内容']))
                except Exception:
                    # Best effort: one bad record must not abort the page.
                    print("out2File error")

            print('当前页面保存完成')
    
    
    
    
    
    # Crawl the reply pages one after another and append each page to the
    # output file until a page is empty or a request/parse step fails.
    e = 0  # stop flag: set to 1 when the crawl should end

    page = 1

    while e == 0:

        url = "https://api.bilibili.com/x/v2/reply/main?&jsonp=jsonp&next=" + str(page) + "&type=1&oid=677870443&mode=3&plat=1&_=1641278727643"

        try:
            print()

            content = get_content(url)

            print("page:", page)

            if not content:
                # An empty page means there is nothing left to save; without
                # this check the loop would keep requesting pages forever.
                print("no more comments, stopping")
                e = 1
            else:
                Out2File(content)

                page = page + 1

                # To lower the risk of an IP ban, rest 5 seconds every 10 pages.
                if page % 10 == 0:
                    time.sleep(5)
        except Exception as err:
            # Any failure while fetching, parsing or saving ends the crawl.
            # Report the reason instead of stopping silently; Ctrl-C is not
            # caught here and interrupts the script as usual.
            print("crawl stopped on page {}: {!r}".format(page, err))
            e = 1

    参考视频:https://www.bilibili.com/video/BV1fu411d7Hy?from=search&seid=3483579157564497530&spm_id_from=333.337.0.0

  • 相关阅读:
    Kubernetes 服务入口管理 Traefik Ingress Controller
    flex的titlewindow如何自适应浏览器的宽度和高度
    JQuery的事件中使用this
    jQuery控制 input 不可编辑
    jquery 操作 input显示或者隐藏
    Word 创建模板
    HTML转PDF
    SQL server 自增主键重新从1开始
    读取 .properties文件到数据库
    根据json生成java实体类文件
  • 原文地址:https://www.cnblogs.com/520520520zl/p/15762472.html
Copyright © 2011-2022 走看看