zoukankan      html  css  js  c++  java
  • 抓取b站视频的带有数字的评论

    import requests
    import re
    import os
    import sys
    import json
    
    # Module-level accumulators shared by the functions below.
    aid_list = []    # video ids, appended by getAllAVList
    info_list = []   # output lines (per-video header + matching comments)
    title_list = []  # video titles, index-aligned with aid_list
    
    def getAllAVList(mid, size, page):
        """Collect the ids and titles of the videos submitted by one member.

        Requests pages 1..``page`` of the member's submitted-videos listing,
        asking for ``size`` videos per page, and appends every video's ``aid``
        and ``title`` to the module-level ``aid_list`` and ``title_list`` (both
        lists grow in the same order). Prints ``aid_list`` once all pages have
        been read.

        Args:
            mid: member (uploader) id placed in the ``mid`` query parameter.
            size: value for the ``pagesize`` query parameter.
            page: number of pages to request, starting from page 1.
        """
        url_template = ('http://space.bilibili.com/ajax/member/getSubmitVideos'
                        '?mid={}&pagesize={}&page={}')
        for page_no in range(1, page + 1):
            response = requests.get(url_template.format(mid, size, page_no))
            # Payload shape relied on here:
            # {"data": {"vlist": [{"aid": ..., "title": ..., ...}, ...]}}
            videos = json.loads(response.text)['data']['vlist']
            for video in videos:
                aid_list.append(video['aid'])
                title_list.append(video['title'])
        print(aid_list)
    
    # A message consisting solely of CJK ideographs (U+4E00-U+9FA5), ASCII
    # letters and digits, with at least one digit somewhere in it.
    _DIGIT_COMMENT_RE = re.compile(
        r'^[\u4E00-\u9FA5A-Za-z0-9]*[0-9][\u4E00-\u9FA5A-Za-z0-9]*$')


    def getAllCommentList(item):
        """Append the digit-containing comments of one video to ``info_list``.

        A ``'begin <title>'`` header line is appended (and printed) first, then
        one ``'<uname>: <message>'`` line for every top-level reply whose
        message matches ``_DIGIT_COMMENT_RE``.

        Args:
            item: video id (aid); must already be present in ``aid_list`` so
                that its title can be looked up in ``title_list``.

        Raises:
            ValueError: if ``item`` is not in ``aid_list``.
        """
        header = 'begin %s' % title_list[aid_list.index(item)]
        info_list.append(header)
        print(header)

        # The first request is only used to read the total comment count.
        count_url = ('http://api.bilibili.com/x/reply?type=1&oid=' + str(item)
                     + '&pn=1&nohot=1&sort=0')
        count_payload = json.loads(requests.get(count_url).text)
        commentsNum = count_payload['data']['page']['count']

        # Assumes 20 replies per page (TODO confirm against the API). This asks
        # for floor(count / 20) + 1 pages, which is one page too many whenever
        # the count is an exact multiple of 20.
        stop_page = commentsNum // 20 + 2
        for n in range(1, stop_page):
            page_url = ('https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn='
                        + str(n) + '&type=1&oid=' + str(item)
                        + '&sort=1&nohot=1')
            page_payload = json.loads(requests.get(page_url).text)
            # Presumably "replies" is null for a page past the last comment;
            # treat that as an empty page instead of crashing on iteration.
            replies = page_payload['data']['replies'] or []
            for reply in replies:
                message = reply['content']['message']
                if _DIGIT_COMMENT_RE.match(message):
                    info_list.append(reply['member']['uname'] + ': ' + message)
    
    def saveTxt(filename, filecontent):
        """Write every entry of ``filecontent`` on its own line to a text file.

        Args:
            filename: output path without extension; ``'.txt'`` is appended
                after converting the value with ``str``.
            filecontent: iterable of strings; each one is written followed by
                a newline.

        The file is opened in ``'w'`` mode with UTF-8 encoding, so an existing
        file of the same name is overwritten.
        """
        filename = str(filename) + '.txt'
        with open(filename, 'w', encoding='utf-8') as txt:
            for content in filecontent:
                txt.write(content + '\n')
    
    if __name__ == '__main__':
        # Script entry point: list the member's videos (50 pages of 1 video
        # each), scrape the comments of every video, then dump everything.
        getAllAVList(mid=2019740, size=1, page=50)
        # info_list is not cleared between videos, so the single output file
        # holds the header and matching comments of every video in order.
        for aid in aid_list:
            getAllCommentList(aid)
        saveTxt('abc', info_list)
    

    程序的主要思路是借助b站的api进行数据的提取。首先,流程是视频信息接口→视频id→评论接口的评论数量→页数→访问评论字符串→通过正则表达式筛选出含有数字的评论,写在文件中。
    json.loads函数将字符串转成字典格式。
    遇见字符串里有\uxxxx形式的字符(Unicode转义序列),想转成其原本的字符,使用string.encode('utf-8').decode('unicode_escape')
    中文、字母和数字的正则表示法是[\u4E00-\u9FA5A-Za-z0-9]

    参考链接:

    1

    爬虫如何抓取b站评论,弹幕等内容? - 肥肥杨的回答 - 知乎

    2

    python: 关于解决'\u'开头的字符串转中文的方法

  • 相关阅读:
    关于在MAC上进行 LARAVEL 环境 Homestead 安装过程记录
    js 贷款计算器
    js 实现阶乘
    js 两点间距离函数
    composer Your requirements could not be resolved to an installable set of packages
    vue 项目优化记录 持续更新...
    vue 项目打包
    vue 真机调试页面出现空白
    vue 真机调试
    谈谈-Android状态栏的编辑
  • 原文地址:https://www.cnblogs.com/tellw/p/13158653.html
Copyright © 2011-2022 走看看