信息解读
1446.2345,1,25,16777215,1312863760,0,eff85771,42759017中几个逗号分割的数据
第一个参数 time 是弹幕出现的时间以秒数为单位。
第二个参数 mode 是弹幕的模式1..3 滚动弹幕 4底端弹幕 5顶端弹幕 6.逆向弹幕 7精准定位 8高级弹幕。
第三个参数 size 是字号, 12非常小,16特小,18小,25中,36大,45很大,64特别大。
第四个参数 color 是字体的颜色以HTML颜色的十进制为准。
第五个参数 timestamp 是Unix格式的时间戳。基准时间为 1970-1-1 08:00:00。
第六个参数 pool 是弹幕池 0普通池 1字幕池 2特殊池。
第七个参数 author 是发送者的ID,用于“屏蔽此弹幕的发送者”功能。
第八个参数 rowid 是弹幕在弹幕数据库中rowID 用于“历史弹幕”功能。
最后我们加一个 text 表示弹幕本身的内容。
实现
我们把每条弹幕的所有信息(连同视频的 cid)一起整到一个字典对象中,最终输出一个字典对象的列表。
单线程按搜索结果批量爬取
import json
import re
import requests
import json
def GetHTMLContent(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
response = requests.get(url, headers=headers)
return response.content.decode("utf-8")
def SaveDanmuList(list, cid):
reDanmu = re.compile(
r'<d p="(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)">(.*?)</d>')
listDanmu = re.findall(reDanmu, list)
fileOutput = open("output_"+cid+".json", "w", encoding="utf-8")
listDictDanmu = []
for itemDanmu in listDanmu:
dictItemDanmu = {}
dictItemDanmu["cid"] = cid
dictItemDanmu["time"] = itemDanmu[0]
dictItemDanmu["mode"] = itemDanmu[1]
dictItemDanmu["size"] = itemDanmu[2]
dictItemDanmu["color"] = itemDanmu[3]
dictItemDanmu["timestamp"] = itemDanmu[4]
dictItemDanmu["pool"] = itemDanmu[5]
dictItemDanmu["author"] = itemDanmu[6]
dictItemDanmu["rowid"] = itemDanmu[7]
dictItemDanmu["text"] = itemDanmu[8]
listDictDanmu += [dictItemDanmu]
jsonDictDanmu = json.dumps(listDictDanmu)
fileOutput.write(json.dumps(listDictDanmu, ensure_ascii=False,
sort_keys=True, indent=4, separators=(',', ':')))
fileOutput.close()
def GetDanmuByCid(queryCid):
urlDanmuXml = 'https://comment.bilibili.com/'+queryCid+'.xml'
strDanmuXml = GetHTMLContent(urlDanmuXml)
SaveDanmuList(strDanmuXml, queryCid)
def GetCidByBid(queryBid):
urlGetCid = "https://api.bilibili.com/x/player/pagelist?bvid=" +
queryBid + "&jsonp=jsonp"
strCidJson = GetHTMLContent(urlGetCid)
jsonCid = json.loads(strCidJson)
return str(jsonCid["data"][0]["cid"])
def GetDanmuByBid(queryBid):
queryCid = GetCidByBid(queryBid)
GetDanmuByCid(queryCid)
def GetBidsBySearch(searchKeyword):
urlSearch = "https://search.bilibili.com/all?keyword=" +
searchKeyword+"&from_source=web_search"
htmlSearch = GetHTMLContent(urlSearch)
reBid = re.compile(r'//www.bilibili.com/video/(.*?)?from=search')
listBid = re.findall(reBid, htmlSearch)
return listBid
if __name__ == "__main__":
listBid = GetBidsBySearch("记忆碎片")
for itemBid in listBid:
GetDanmuByBid(itemBid)
print("Succeed :)")
多线程并行
我们开了十个线程并行,每个线程负责一页搜索结果。
import json
import re
import requests
import json
import time
import threading
def GetHTMLContent(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
response = requests.get(url, headers=headers)
return response.content.decode("utf-8")
def SaveDanmuList(list, cid):
reDanmu = re.compile(
r'<d p="(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?),(.*?)">(.*?)</d>')
listDanmu = re.findall(reDanmu, list)
fileOutput = open("output/"+cid+".json", "w", encoding="utf-8")
listDictDanmu = []
for itemDanmu in listDanmu:
dictItemDanmu = {}
dictItemDanmu["cid"] = cid
dictItemDanmu["time"] = itemDanmu[0]
dictItemDanmu["mode"] = itemDanmu[1]
dictItemDanmu["size"] = itemDanmu[2]
dictItemDanmu["color"] = itemDanmu[3]
dictItemDanmu["timestamp"] = itemDanmu[4]
dictItemDanmu["pool"] = itemDanmu[5]
dictItemDanmu["author"] = itemDanmu[6]
dictItemDanmu["rowid"] = itemDanmu[7]
dictItemDanmu["text"] = itemDanmu[8]
listDictDanmu += [dictItemDanmu]
jsonDictDanmu = json.dumps(listDictDanmu)
fileOutput.write(json.dumps(listDictDanmu, ensure_ascii=False,
sort_keys=True, indent=4, separators=(',', ':')))
fileOutput.close()
def GetDanmuByCid(queryCid):
urlDanmuXml = 'https://comment.bilibili.com/'+queryCid+'.xml'
strDanmuXml = GetHTMLContent(urlDanmuXml)
SaveDanmuList(strDanmuXml, queryCid)
def GetCidByBid(queryBid):
urlGetCid = "https://api.bilibili.com/x/player/pagelist?bvid=" +
queryBid + "&jsonp=jsonp"
strCidJson = GetHTMLContent(urlGetCid)
jsonCid = json.loads(strCidJson)
return str(jsonCid["data"][0]["cid"])
def GetDanmuByBid(queryBid):
queryCid = GetCidByBid(queryBid)
GetDanmuByCid(queryCid)
def GetBidsBySearch(searchKeyword, page=1):
urlSearch = "https://search.bilibili.com/all?keyword=" +
searchKeyword+"&from_source=web_search&page=" + str(page)
htmlSearch = GetHTMLContent(urlSearch)
reBid = re.compile(r'//www.bilibili.com/video/(.*?)?from=search')
listBid = re.findall(reBid, htmlSearch)
return listBid
def GetDanmuByBids(listBid):
for itemBid in listBid:
GetDanmuByBid(itemBid)
print("Thread finish")
if __name__ == "__main__":
threadHandles = []
timeStart = time.time()
for page in range(1, 10):
listBid = GetBidsBySearch("记忆碎片", page)
# GetDanmuByBids(listBid)
threadHandles += [threading.Thread(target=GetDanmuByBids,
name="Thread "+str(page), args=(listBid,))]
for threadHandle in threadHandles:
threadHandle.start()
for threadHandle in threadHandles:
threadHandle.join()
timeEnd = time.time()
print("timeused: ", timeEnd-timeStart)
测试结果
用电影名“记忆碎片”作为关键词进行检索测试,10 个线程,每个线程平均 18 条视频。在 10.17 秒内,获取 178 个视频的弹幕,整理后 JSON 共 8.36 MB(平均约 1KB~4 条弹幕)