No. 1 on the Weibo trending list; over 350 million popularity points and a screen full of bullet comments on Bilibili; more than 6 million viewers on Tencent Video; even CCTV News posted a Weibo congratulating EDG. Today we'll use Python to scrape the comments on the Bilibili video "我们是冠军" ("We Are the Champions") and do a bit of visualization: the proper way to capture all that cheering.
Comment-scraping code:
```python
import csv
import json
import random
import time

import requests

# output CSV: one row per comment
f = open('我们是冠军.csv', mode='a', encoding='utf-8-sig', newline='')
csvWriter = csv.DictWriter(f, fieldnames=[
    '评论人',    # commenter
    '性别',      # gender
    '点赞数',    # likes
    '评论时间',  # comment time
    '评论内容',  # comment text
])
csvWriter.writeheader()  # write the header row

startStampTime = int(time.time() * 1000)

headers = {
    "cookie": "_uuid=BE35640F-EB4E-F87D-53F2-7A8FD5D50E3330964infoc; buvid3=D0213B95-F001-4A46-BE4F-E921AE18EB67167647infoc; CURRENT_BLACKGAP=1; CURRENT_QUALITY=0; rpdid=|(u))ku~m)kJ0J'uYJuRRRYmk; CURRENT_FNVAL=976; video_page_version=v_old_home_17; blackside_state=1; LIVE_BUVID=AUTO1516364619569495; sid=bqyo86kv; innersign=1; PVID=2",
    "referer": "https://www.xxx.com/video/BV12R4y1E7kn?spm_id_from=333.999.0.0",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
}

for page in range(1, 100 + 1):
    print(f'====================正在爬取第{page}页的数据====================')
    time.sleep(random.randint(2, 5))  # random sleep so we don't hammer the API
    nextStampTime = int(time.time() * 1000)
    # comment API for this video (oid is its numeric id); the callback
    # parameter makes it a JSONP-style response
    url = f'https://api.xxx.com/x/v2/reply/main?callback=jQuery172046940903221511165_{startStampTime}&jsonp=jsonp&next={page}&type=1&oid=336587753&mode=3&plat=1&_={nextStampTime}'
    response = requests.get(url=url, headers=headers)
    # strip the 42-character JSONP prefix "jQuery..._<timestamp>(" and the trailing ")"
    json_data = json.loads(response.text[42:-1])
    # the comments live under data.replies
    data = json_data['data']['replies']
    print(f'第{page}页包含:{len(data)}')
    for item in data:
        name = item['member']['uname']
        sex = item['member']['sex']
        like = item['like']
        ctime = item.get('ctime')  # Unix timestamp in seconds
        commenttime = time.strftime('%Y-%m-%d %H:%M', time.localtime(ctime))
        content = item['content']['message']
        dit = {
            '评论人': name,
            '性别': sex,
            '点赞数': like,
            '评论时间': commenttime,
            '评论内容': content,
        }
        print(dit)
        csvWriter.writerow(dit)

f.close()
print('数据采集完毕!')  # done collecting
```
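For reference, each element of `json_data['data']['replies']` is a nested dict, and the loop above only touches a handful of its keys. The sketch below is a hand-written illustration of that shape with placeholder values, not real API output:

```python
# Illustrative shape of a single reply object; values are made up,
# only the keys read by the scraper above are shown.
sample_reply = {
    'member': {
        'uname': '某位网友',   # commenter's nickname
        'sex': '保密',         # gender: 男 / 女 / 保密
    },
    'like': 128,               # like count on the comment
    'ctime': 1636300800,       # comment time as a Unix timestamp (seconds)
    'content': {
        'message': '我们是冠军!',  # the comment text itself
    },
}
```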
Now let's build a word cloud from the comments:
```python
import re

import jieba
import matplotlib.pyplot as plt
import pandas as pd
from wordcloud import WordCloud

# load the scraped comments
df = pd.read_csv('我们是冠军.csv')

# drop duplicate rows and missing values
df_new = df.drop_duplicates()
df_new = df_new.dropna()

# words we don't want dominating the cloud
STOPWORDS = {"回复", "@", "我", "她", "你", "他", "了", "的", "吧", "吗", "在", "啊", "不", "也", "还", "是",
             "说", "都", "就", "没", "做", "人", "赵薇", "被", "不是", "现在", "什么", "这", "呢", "知道", "邓", "我们", "他们", "和", "有", "",
             "要", "就是", "但是", "而", "为", "自己", "中", "问题", "一个", "没有", "到", "这个", "并", "对", "点赞", "热词", "系列", "热词系列"}

# take the distinct comment texts
textList = df_new['评论内容'].value_counts().sort_values().index.tolist()
# join them into one long string
strText = ' '.join(textList)
# drop latin letters, digits and some punctuation
newTxt = re.sub(r"[A-Za-z0-9!%\[\],。]", "", strText)
# segment the Chinese text with jieba and re-join with spaces
# so that WordCloud can tokenize it
words = ' '.join(jieba.lcut(newTxt))

# build the word cloud
wordcloudword = WordCloud(
    background_color='white',
    width=1080,
    height=960,
    # font_path="../文悦新青年.otf",
    font_path='C:/Windows/Fonts/simhei.ttf',  # must be a font with Chinese glyphs
    max_words=150,
    scale=10,             # rendering resolution
    max_font_size=100,
    stopwords=STOPWORDS,
    # mask=img_array,     # optionally shape the cloud with a background image
    collocations=False).generate(words)

plt.imshow(wordcloudword)
plt.axis('off')
plt.show()
wordcloudword.to_file('wc.png')
```
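Before reading too much into the picture, it can help to look at the raw token frequencies behind it. Below is a minimal sketch (not part of the original script) that re-reads the CSV produced by the scraper and prints the 20 most common jieba tokens; the trimmed stopword set here is only for brevity, and in practice you would reuse the full STOPWORDS set above:

```python
from collections import Counter

import jieba
import pandas as pd

# quick frequency check on the comment text, independent of WordCloud
df = pd.read_csv('我们是冠军.csv').drop_duplicates().dropna()
text = ' '.join(df['评论内容'].astype(str).tolist())

stopwords = {"回复", "我", "你", "他", "了", "的", "是", "我们"}  # trimmed example set
tokens = [w for w in jieba.lcut(text)
          if len(w) > 1 and w not in stopwords]  # skip single characters and stopwords

for word, freq in Counter(tokens).most_common(20):
    print(f'{word}: {freq}')
```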