import requests
from lxml import etree
import re
import time
import json
import threading
import urllib3
urllib3.disable_warnings()
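# Overall flow: scrape the ranking page for every channel, collect each
# uploader's homepage from the per-channel ranking, then pull profile,
# relation, stats, charging, and video data from bilibili's public APIs
# through free proxies, one worker thread per task.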
url = "https://www.bilibili.com/ranking/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
# All channel data, keyed by channel name
c_data = {}
# Fetch the channel list from the ranking page
def channels_work():
    res = requests.get(url=url, headers=headers).text
    # Pull the embedded channel JSON out of the page source
    data = re.findall('"channels":(.*?),"showTypes":', res)
    # Parse every channel and its tid
    channels = json.loads(data[0])
    print("All category info scraped OK")
    print(channels)
    return channels
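# The parsed channel list is expected to look like (values hypothetical):
#   [{"tid": 1, "name": "动画", ...}, ...]
# Only the 'tid' and 'name' keys are used by the main block below.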
# Collect the homepage URL of every uploader on a channel's ranking page
def channels_detail(c_url, c_name, all_up):
    res = requests.get(url=c_url, headers=headers).text
    tree = etree.HTML(res)
    all_video = tree.xpath("//ul[@class='rank-list']/li")
    for i in all_video:
        # Grab the uploader's homepage link from each ranking entry
        up_home = "https:" + i.xpath(".//div[@class='detail']/a/@href")[0]
        all_up.append(up_home)
    print(c_name + " ranking data scraped OK")
def up_detail(url, proxies):
    up = {}
    up['video'] = []
    # Parse the uploader's mid from the homepage URL
    lis = url.split("/")
    up['up_mid'] = lis[-1]
    print("Start scraping info for uploader with mid " + str(up['up_mid']))
    # Uploader profile API
    url1 = 'https://api.bilibili.com/x/space/acc/info?mid=' + str(up['up_mid'])
    res = requests.get(url=url1, headers=headers, proxies=proxies, verify=False).text
    up_d = json.loads(res)
    up['up_name'] = up_d["data"]["name"]
    up['up_face'] = up_d["data"]["face"]
    up['up_sex'] = up_d["data"]["sex"]
    up['up_sign'] = up_d["data"]["sign"]
    up['up_level'] = up_d["data"]["level"]
    up['up_fans_badge'] = up_d["data"]["fans_badge"]
    # Following / follower API, sample response:
    # {"code": 0, "message": "0", "ttl": 1,"data": {"mid": 18775476, "following": 128, "whisper": 0, "black": 0, "follower": 519397}}
    url2 = 'https://api.bilibili.com/x/relation/stat?vmid=' + str(up['up_mid'])
    res = requests.get(url=url2, headers=headers, proxies=proxies, verify=False).text
    up_d = json.loads(res)
    up['up_following'] = up_d['data']['following']
    up['up_follower'] = up_d['data']['follower']
    # Video views / article reads API, sample response:
    # {"code": 0, "message": "0", "ttl": 1,"data": {"archive": {"view": 37989388}, "article": {"view": 560}, "likes": 1688691}}
    url3 = 'https://api.bilibili.com/x/space/upstat?mid=' + str(up['up_mid'])
    res = requests.get(url=url3, headers=headers, proxies=proxies, verify=False).text
    up_d = json.loads(res)
    up['up_archive'] = up_d['data']['archive']['view']
    up['up_likes'] = up_d['data']['likes']
    up['up_article'] = up_d['data']['article']['view']
    # Charging (sponsor) count API, sample response:
    # {"code":0,"data":{"display_num":0,"count":13,"total_count":994,"list":...
    url4 = 'https://elec.bilibili.com/api/query.rank.do?mid=' + str(up['up_mid'])
    res = requests.get(url=url4, headers=headers, proxies=proxies, verify=False).text
    up_d = json.loads(res)
    try:
        up['up_total_count'] = up_d['data']['total_count']
    except (KeyError, TypeError):
        # Some uploaders have charging disabled; fall back to 0
        up['up_total_count'] = 0
    # Video list API; the first request only fetches the total video count
    url5 = 'https://api.bilibili.com/x/space/arc/search?mid=' + str(up['up_mid']) + '&ps=1&pn=1'
    res = requests.get(url=url5, headers=headers, proxies=proxies, verify=False).text
    up_d = json.loads(res)
    count = up_d['data']['page']['count']
    up = get_video(count, up)
    print(up)
def get_video(count, up):
    pn = 1
    while count > 0:
        # Page through the uploader's videos, 100 per request
        url5 = 'https://api.bilibili.com/x/space/arc/search?mid=' + str(up['up_mid']) + '&ps=100&pn=' + str(pn)
        res = requests.get(url=url5, headers=headers).text
        up_d = json.loads(res)
        for video in up_d['data']['list']['vlist']:
            # Build a fresh dict per video; reusing one dict would make
            # every appended entry reference the same object
            v = {}
            v['title'] = video['title']
            v['pic_url'] = video['pic']
            v['comment'] = video['comment']
            v['video_review'] = video['video_review']
            v['created'] = video['created']
            up['video'].append(v)
        pn += 1
        count -= 100
    return up
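# Optional sketch, not wired into the original flow: persist each scraped
# uploader dict as one JSON line instead of only printing it. The helper
# name and the 'up_data.jsonl' path are assumptions, not part of the script.
_save_lock = threading.Lock()
def save_up(up, path='up_data.jsonl'):
    # Serialize writes, since up_detail runs on multiple threads
    with _save_lock:
        with open(path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(up, ensure_ascii=False) + '\n')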
# Scrape free proxies, then scrape uploader info through the first usable one
def ip_run(i, ts2):
    url = 'https://www.xicidaili.com/nn/'
    ip_response = requests.get(url=url, headers=headers).text
    ips = re.findall(r"<td>(\d+\.\d+\.\d+\.\d+)</td>", ip_response, re.S)
    ports = re.findall(r"<td>(\d+)</td>", ip_response, re.S)
    for ip, port in zip(ips, ports):
        proxies = {
            "http": "http://" + ip + ":" + port,
            "https": "http://" + ip + ":" + port,
        }
        try:
            # Probe the proxy against a bilibili space page
            requests.get('https://space.bilibili.com/337312411', proxies=proxies, timeout=3)
            print("Proxy works")
            # Hand the working proxy to the detail scraper on its own thread
            print("Start scraping details for uploader at " + i)
            t = threading.Thread(target=up_detail, args=(i, proxies))
            t.start()
            time.sleep(3)
            ts2.append(t)
            break
        except Exception:
            print("Proxy unusable")
if __name__ == '__main__':
    # Fetch channel info
    channels = channels_work()
    # Collect uploader homepages, one thread per channel
    all_up = []
    ts = []
    ts2 = []
    for c in channels:
        # Build the channel ranking URL
        c_url = url + "all/" + str(c['tid']) + "/1/3"
        c_data[c["name"]] = []
        t = threading.Thread(target=channels_detail, args=(c_url, c['name'], all_up))
        t.start()
        ts.append(t)
    for t in ts:
        t.join()
    for i in all_up:
        ip_run(i, ts2)
    for t in ts2:
        t.join()
    print("All data scraped")