zoukankan      html  css  js  c++  java
  • 抓取 Bilibili 弹幕数据并进行数据分析

    视频地址 https://www.bilibili.com/bangumi/play/ss39462?spm_id_from=333.851.b_62696c695f7265706f72745f616e696d65.52
    弹幕地址:固定的 URL 前缀 + 视频的 cid + ".xml"(cid 可在视频页面源码中搜索 "cid" 得到)
    例如:https://comment.bilibili.com/428471132.xml

    数据获取部分
    # Full script
    #   Fetch the danmaku XML for one video and store it as a CSV table
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    
    cid = 428471132
    url = "https://comment.bilibili.com/{}.xml".format(cid)
    # A timeout keeps the script from hanging forever on a stalled connection,
    # and raise_for_status() stops us from parsing an HTTP error page as data.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    response.encoding = "utf-8"
    
    #  Parse the document; every danmaku is one <d> element
    soup = BeautifulSoup(response.text,"lxml")
    datas = soup.select('d')
    
    # Text content of each danmaku
    comments = [data.text for data in datas]
    
    #  Attribute fields: the comma-separated "p" attribute of each <d> element
    #  (appearance time, mode, font, colour, send time, danmaku pool, user ID, rowID, ...)
    info_comments = [data.get('p').split(',') for data in datas]
    
    # Build the table: one column per attribute field
    columns = ["出现时间点","模式","字体","颜色","发送时间","弹幕池","用户ID","rowID","未知参数"]
    comment_datas = pd.DataFrame(info_comments,columns=columns)
    
    # Attach the danmaku text next to its attributes
    comment_datas["comments"] = comments
    # Persist the result; utf-8-sig writes a BOM at the start of the file
    comment_datas.to_csv("comments.csv",encoding="utf-8-sig")
    print("finish...")
    数据分析部分

    一 绘制词云图
    
    

      # 加载数据
      import pandas as pd
      comment_datas = pd.read_csv("comments.csv",encoding="utf-8-sig")
      print(comment_datas)

    ##  Draw a word cloud from the danmaku text
    ##  (expects `pd` and `comment_datas` from the loading step to be in scope)
    from itertools import chain
    
    import jieba
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud
    
    #   Danmaku text. Blank comments are read back from the CSV as NaN, which
    #   jieba cannot tokenize, so drop them and make sure every entry is a str.
    comments = comment_datas["comments"].dropna().astype(str)
    
    #    Tokenize
    jieba.load_userdict("hong.txt")  #  load the user-defined dictionary
    comments_cut = comments.apply(jieba.lcut)  # split every danmaku into words
    
    #  Remove stop words.
    #  The stop list is read as a set with one entry per line, so the membership
    #  test matches whole words. Testing against the raw file text would be a
    #  substring test and would discard any token that occurs inside a stop word.
    with open("stoplist.txt","r",encoding="utf-8") as f:
        stop_words = {line.strip() for line in f}
    # Whitespace-only tokens (newlines, spaces, empty strings) carry no meaning
    comments_after = comments_cut.apply(
        lambda x: [i for i in x if i.strip() and i not in stop_words]
    )
    
    #    Word frequencies: flatten the per-danmaku token lists, then count
    results = list(chain.from_iterable(comments_after))
    word_count = pd.Series(results).value_counts()
    
    #    Draw the word cloud  https://tool.lu/cutout/
    pic = plt.imread("aixin.jpg")  #  image used as the cloud outline (mask)
    # Raw string: the backslashes in the Windows path must not be read as escapes
    word_cloud = WordCloud(mask=pic,background_color='white',font_path=r"C:\Windows\Fonts\simhei.ttf")
    word_cloud.fit_words(word_count)
    plt.imshow(word_cloud)
    plt.axis('off')
    plt.show()
    二 分析弹幕数量与日期,时间的关系
    #  分析弹幕数量与日期,时间的关系
    
    
    #  Load the data
    import pandas as pd
    from datetime import datetime
    
    comment_datas = pd.read_csv("comments.csv",encoding="utf-8-sig")
    # Turn the numeric send timestamp into a 'YYYY-mm-dd HH:MM:SS' string
    # (datetime.fromtimestamp converts to the machine's local time)
    comment_datas["发送时间"] = comment_datas["发送时间"].apply(lambda x :datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
    
    #  How many danmaku each user sent
    userID = comment_datas["用户ID"]
    userID_count = userID.value_counts()
    
    #  How many users sent exactly k danmaku (index = k, value = number of users)
    userID_count_count = userID_count.value_counts()
    
    #  Same table ordered by k
    userID_count_count_sort = userID_count_count.sort_index()
    print(userID_count_count_sort)
    
    #  Number of users beyond the six smallest per-user counts
    num = userID_count_count_sort.iloc[6:]
    print(num.sum())
    
    ##  Bar chart of the (at most) six smallest per-user counts
    import matplotlib.pyplot as plt 
    num = userID_count_count_sort.iloc[:6]
    # Size the bars from the data so fewer than six distinct counts still plot,
    # and label each bar with the real danmaku count instead of its position.
    positions = range(len(num))
    plt.style.use('ggplot')
    plt.rcParams['font.sans-serif'] = 'SimHei'
    plt.bar(positions,num)
    plt.xticks(positions,num.index)
    plt.xlabel("弹幕数量")
    plt.ylabel("用户数量")
    plt.title("弹幕发布数量分布图")
    plt.show()
    
    ##  Danmaku volume per calendar day
    ##  (uses `pd`, `plt` and `comment_datas` from the preceding steps)
    #  Keep only the date part so hours/minutes/seconds do not split the counts
    send_times = pd.to_datetime(comment_datas["发送时间"])
    dates = pd.Series([stamp.date() for stamp in send_times])
    #  Danmaku per day, in chronological order
    num = dates.value_counts().sort_index()
    
    #  Line chart: one point per day, every day labelled on the x axis
    tick_positions = range(len(num))
    plt.figure(figsize=(16,9))
    plt.plot(tick_positions,num)
    plt.xticks(tick_positions,num.index,rotation=45)
    plt.title("弹幕发布数量随日期变化图")
    plt.xlabel("日期变化")
    plt.ylabel("弹幕数量")
    plt.show()
    
    
    ###  Danmaku volume by day of the week
    import pandas as pd
    import matplotlib.pyplot as plt
    from datetime import datetime
    
    comment_datas = pd.read_csv("comments.csv",encoding="utf-8-sig")
    # Turn the numeric send timestamp into a 'YYYY-mm-dd HH:MM:SS' string
    comment_datas["发送时间"] = comment_datas["发送时间"].apply(lambda x :datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
    
    dates = pd.to_datetime(comment_datas["发送时间"])
    # dt.weekday numbers the days Monday=0 ... Sunday=6
    date = pd.Series(dates.dt.weekday)
    # reindex guarantees all seven days are present (0 for days without data),
    # so the tick labels below always line up with the plotted points
    date_count = date.value_counts().sort_index().reindex(range(7),fill_value=0)
    
    # Labels follow the Monday-first numbering of dt.weekday
    weekday_labels = ["周一","周二","周三","周四","周五","周六","周日"]
    
    plt.figure(figsize=(16,9))
    plt.plot(range(len(date_count)),date_count)
    plt.xticks(range(len(date_count)),weekday_labels,rotation=45)
    plt.ylabel("弹幕数量")
    plt.xlabel("日期变化")
    plt.title("弹幕发布数量随日期变化图")
    plt.show()
    
    
    
     
  • 相关阅读:
    09.非线性-指数增长模型
    08.多元线性回归模型
    07.线性回归模型
    06.齐普夫定律验证
    05.森林火灾模型
    04.沙堆模型
    03.优先链接模型
    02.中心极限定理验证
    centos6字符
    dns解析慢 修改的参数
  • 原文地址:https://www.cnblogs.com/Skypeduty1225/p/15569270.html
Copyright © 2011-2022 走看看