zoukankan      html  css  js  c++  java
  • Python爬取B站视频信息

    该文内容已失效。现已改用 scrapy + scrapy-splash 来爬取该网站的视频及用户信息,但由于B站的反爬机制会封IP,而网上的免费代理IP绝大部分已失效,
    无法实现一个可靠的IP代理池;免费代理网站本身又有各种反爬,解决反爬后获取到的有效IP占比极低,不想折腾,因此视频信息暂时无法成功获取。
    github地址 https://github.com/delav/bstation

    该爬虫可以爬取B站所有视频的信息:标题,发布时间,链接,点击数,弹幕数,

    收藏数,硬币数,分享数,作者,作者性别,(生日)。输入你要爬取的起始

    视频编号(如https://www.bilibili.com/video/av15010461,输入“15010461”)

    然后输入需要爬取的数量,即可。可修改代码选择存入数据库或者Excel文件

    没有用到多进程,多线程,爬取速度有点慢。

    注意:起始视频编号的视频必须是存在的,如果输入的起始视频编号不存在,

    会出现错误,暂时没解决

      数据存入数据库,本地必须先安装MySQL。

    代码如下(由于B站源代码经常改,只要查看源代码,修改一些信息的xpath获取方式即可):

    # coding: utf-8

    # windows终端运行修改以下:
    # i = input("起始视频编号:".decode('utf-8').encode('gbk'))
    # print u"爬取完成"

    import requests
    import urllib2
    import zlib
    from lxml import etree
    import MySQLdb
    import datetime
    import os
    import xlwt
    import multiprocessing
    from xlrd import open_workbook
    from xlutils.copy import copy
    # import random
    import warnings
    import sys

    # Reload sys, then set the process-wide default string encoding to UTF-8.
    # The two statements must stay in this order.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    warnings.filterwarnings("ignore") # suppress all warning messages
    # JSON statistics endpoint; "{}" is filled with the numeric video id.
    mode_url = 'https://api.bilibili.com/x/web-interface/archive/stat?aid={}'
    # Video page URL; "{}" is filled with the numeric video id.
    title_url = 'https://www.bilibili.com/video/av{}'
    # The Excel output file lives in the current working directory.
    path = os.getcwd()
    file_path = path + os.sep + 'bilibili.xls'
    # Request headers shared by every HTTP call in this script.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/49.0.2623.112 Safari/537.36'}


    # 获取所需要的信息列表
    def get_info(t_url, m_url):
        """Scrape one video's metadata and return it as a flat list.

        The video page at ``t_url`` supplies the title, category, publish time
        and the uploader link; ``m_url`` and the uploader link are then fetched
        as JSON for the statistics and the author details.

        Args:
            t_url: URL of the video page.
            m_url: URL of the JSON statistics endpoint for the same video.

        Returns:
            A list. On full success it holds 11 items in this order: title,
            category, publish time, video address, views, danmaku, favorites,
            coins, shares, author name, author sex. If any step fails, the
            error is printed and the items collected so far are returned, so
            callers must check the length.
        """
        msg_list = []
        try:
            request = urllib2.Request(t_url, headers=headers)
            request.add_header('Accept-encoding', 'gzip')
            response = urllib2.build_opener().open(request, timeout=10)
            try:
                html = response.read()
                gzipped = response.headers.get('Content-Encoding')
            finally:
                # Release the socket even if reading the body fails.
                response.close()
            if gzipped:
                # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
                html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
            html = etree.HTML(html)

            # The uploader link is protocol-relative, so prepend the scheme.
            raw_mid = html.xpath("//div[@class='u-face fl']/a/@href")
            author_url = 'https:' + raw_mid[0]

            # Keep only the part of <title> before the first underscore.
            raw_title = html.xpath("//title[@data-vue-meta='true']/text()")
            msg_list.append(raw_title[0].split('_')[0])  # title
            types = html.xpath("//div[@class='tminfo']/span[last()-1]/a/text()")
            msg_list.append(types[0])  # category
            public_time = html.xpath("//time//i/text()")
            msg_list.append(public_time[0])  # publish time

            # NOTE(review): verify=False disables TLS certificate checks for
            # both requests; kept as in the original - confirm it is needed.
            response1 = requests.get(m_url, headers=headers, verify=False, timeout=10)
            # Same timeout as the stats request so a stalled server cannot
            # hang the crawl forever.
            response2 = requests.get(author_url, headers=headers, verify=False, timeout=10)
            print("3333 %s" % response1.status_code)
            print("4444 %s" % response2.status_code)
            if response1.status_code == 200 and response2.status_code == 200:
                j1 = response1.json()['data']
                aid = 'www.bilibili.com/video/av' + str(j1['aid'])  # video address
                # Views: per the original author's note, a video with no plays
                # shows '--' rather than an integer and the scrape then fails.
                view = j1['view']
                danmaku = j1['danmaku']  # danmaku (bullet comment) count
                favorite = j1['favorite']  # favorites
                coin = j1['coin']  # coins
                share = j1['share']  # shares
                j2 = response2.json()['data']['card']
                author = j2['name'].encode('utf-8')  # author name
                sex = j2['sex'].encode('utf-8')  # author sex
                msg_list.extend([aid, view, danmaku, favorite, coin, share, author, sex])
        except Exception as e:
            # Best effort: report the failure and return what was collected.
            print(e)
        return msg_list


    # 计时装饰器
    def timer(func):
        """Decorator that prints how long each call to ``func`` took.

        The wrapper forwards all positional and keyword arguments to ``func``,
        prints a completion message followed by the elapsed time split into
        days, hours, minutes and seconds, and returns whatever ``func``
        returned.

        Args:
            func: The callable to time.

        Returns:
            The wrapping function, carrying ``func``'s name and docstring.
        """
        import functools  # local import keeps this block self-contained

        @functools.wraps(func)
        def time_count(*args, **kwargs):
            start_time = datetime.datetime.now()
            result = func(*args, **kwargs)
            elapsed = datetime.datetime.now() - start_time
            # timedelta.seconds is the sub-day remainder; days are separate.
            hour, remainder = divmod(elapsed.seconds, 3600)
            minute, second = divmod(remainder, 60)
            print("爬取完成")
            print("一共用时%s天%s时%s分%s秒" % (elapsed.days, hour, minute, second))
            return result
        return time_count


    # 把数据存到MySQL数据库中
    def mysql_save(my_list):
        """Insert one scraped record into the ``info`` table of database ``bili``.

        The database and table are created on first use. The cursor and the
        connection are always closed, including when a statement fails, so
        repeated calls do not leak connections.

        Args:
            my_list: Sequence of 11 values matching the table's columns:
                title, types, pub_time, aid, views, danmaku, favorite, coin,
                share, author, sex.

        Raises:
            Whatever MySQLdb raises on connection or statement failure.
        """
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        conn = MySQLdb.connect(host="localhost",
                               port=3306,
                               user="root",
                               passwd="729814",
                               charset="utf8")
        try:
            cur = conn.cursor()
            try:
                cur.execute("create database if not exists bili")
                conn.select_db('bili')
                cur.execute("create table if not exists info (title varchar(30),"
                            "types varchar(10),"
                            "pub_time varchar(20),"
                            "aid varchar(50),"
                            "views int(20),"
                            "danmaku int(15),"
                            "favorite int(15),"
                            "coin int(10),"
                            "share int(10),"
                            "author varchar(30),"
                            "sex varchar(10))")
                # Values are passed separately so the driver escapes them.
                sql = "insert into info values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                cur.execute(sql, my_list)
                conn.commit()
            finally:
                cur.close()
        finally:
            conn.close()


    # 把数据存到Excel表格中
    def excel_save(my_list):
        """Append one scraped record as a new row in the Excel file ``file_path``.

        If the file does not exist yet it is created with a bold red header
        row. The existing workbook is then reopened, copied, and the record is
        written to the first row after the current last row.

        Args:
            my_list: Values for one row, in header order.

        Any failure is printed, followed by a hint to close the spreadsheet;
        nothing is raised to the caller.
        """
        try:
            first_row = ['标题', '类型', '发布时间', '地址', '播放量', '弹幕', '收藏', '硬币', '分享', '作者', '性别']
            header_style = xlwt.easyxf('font: color-index red, bold on')
            if not os.path.isfile(file_path):
                w = xlwt.Workbook(encoding='utf-8')
                ws = w.add_sheet('Sheet 1')
                for x, value in enumerate(first_row):
                    ws.write(0, x, value.decode('utf-8'), header_style)
                w.save(file_path)
            rexcel = open_workbook(file_path, formatting_info=True)
            # nrows is the index of the first empty row.
            rows = rexcel.sheets()[0].nrows
            excel = copy(rexcel)
            table = excel.get_sheet(0)
            for y, value in enumerate(my_list):
                # isinstance also covers str subclasses, which an exact type
                # comparison would leave undecoded.
                if isinstance(value, str):
                    table.write(rows, y, value.decode('utf-8'))
                else:
                    table.write(rows, y, value)
            excel.save(file_path)
        except Exception as e:
            print(e)
            print("请先关闭bilibili.xls")


    # 主函数
    @timer
    def main(i, n):
        """Crawl ``n`` consecutive video ids starting at ``i`` and store each hit.

        Each id is passed to get_info; a complete 11-field record is saved
        with mysql_save, anything shorter is reported as a failure.
        """
        print("开始爬取....")
        attempt = 0
        video_id = i
        while attempt < n:
            attempt += 1
            record = get_info(title_url.format(video_id), mode_url.format(video_id))
            video_id += 1
            if len(record) != 11:
                print("爬取%s失败失败" % attempt)
                continue
            # Store in the database; call excel_save(record) here instead to
            # write to the Excel file.
            mysql_save(record)
            print("爬取第%s个成功" % attempt)


    if __name__ == '__main__':
        # Python 2's input() evaluates the typed text as an expression, so
        # read the raw line and convert it explicitly; non-numeric input now
        # raises ValueError instead of being executed.
        num1 = int(raw_input("起始视频编号:"))
        print("---------------------")
        num2 = int(raw_input("需要爬取数量:"))
        print("---------------------")
        main(num1, num2)
  • 相关阅读:
    嵌入式:使用dd命令制作烧写文件
    C: 函数的名字是否受大小写影响?
    C++:互斥量C++实现,内存调试,自动锁
    linux环境ubuntu: pushd: not found
    3. 海思Hi3519A MPP从入门到精通(三 视频输入)
    详解YUV数据格式
    海思Hi3519A MPP从入门到精通(二 系统控制)
    海思Hi3519A MPP从入门到精通(一 系统概述)
    HiGV ui代码流程
    音视频学习路线
  • 原文地址:https://www.cnblogs.com/delav/p/7834943.html
Copyright © 2011-2022 走看看