zoukankan      html  css  js  c++  java
  • Python爬取B站视频

    import json
    import os
    import subprocess
    import time
    
    import requests
    import re
    
    
    
    class BLBL(object):
        """Download the audio and video streams of one video page and merge them.

        The page at ``url`` is fetched with browser-like headers, the stream
        URLs are read from the ``window.__playinfo__`` JSON embedded in the page,
        both streams are saved next to the script and finally muxed into one
        file by ``ffmpeg`` (which must be installed and on ``PATH``).
        """

        # Seconds to wait for a connection / for the next bytes of a response.
        REQUEST_TIMEOUT = 30
        # Number of bytes written to disk per chunk while streaming a download.
        CHUNK_SIZE = 1024 * 1024

        def __init__(self, url, cookie, referer):
            """Store the request parameters.

            Args:
                url: Address of the video page to fetch.
                cookie: Value sent as the ``Cookie`` header of the page request.
                referer: Value sent as the ``Referer`` header of every request.
            """
            self.base_url = url
            # Cookie header value
            self.cookie = cookie
            # Referer header value
            self.referer = referer
            # Shared request header values
            self.accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
            self.accept_Encoding = 'gzip, deflate, br'
            self.accept_Language = 'zh-CN,zh;q=0.9,en;q=0.8'
            # Only the header *value* belongs here; the header name is supplied
            # by the dict key when the request headers are built.
            self.user_agent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko)"

        def html(self):
            """Fetch the video page and return its HTML text."""
            # Headers for the initial page request; per the original author the
            # site returns incomplete source without them (anti-scraping).
            base_headers = {
                'Accept': self.accept,
                'Accept-Encoding': self.accept_Encoding,
                'Accept-Language': self.accept_Language,
                'Cache-Control': 'no-cache',
                'Cookie': self.cookie,
                'Referer': self.referer,
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': self.user_agent
            }
            base_response = requests.get(
                self.base_url, headers=base_headers, timeout=self.REQUEST_TIMEOUT)
            return base_response.text

        def xin_xi(self, html):
            """Extract the title and stream URLs from the page HTML.

            Args:
                html: HTML text of the video page.

            Returns:
                A ``(title, audio_url, video_url)`` tuple. The title has ``/``,
                ``:`` and spaces removed so it can be used as a file name stem.

            Raises:
                IndexError: If the play-info script or the title element is not
                    present in ``html``.
                KeyError: If the play-info JSON lacks the expected keys.
            """
            playinfo = re.search(
                '<script>window.__playinfo__=(.*?)</script>', html, re.S)
            if playinfo is None:
                raise IndexError('window.__playinfo__ script not found in page')
            title_match = re.search('<span class="tit">(.*?)</span>', html)
            if title_match is None:
                raise IndexError('<span class="tit"> title not found in page')
            title = title_match.group(1).replace('/', '').replace(':', '').replace(' ', '').strip()
            html_data = json.loads(playinfo.group(1))
            dash = html_data['data']['dash']
            # First listed audio / video stream, first backup URL of each.
            audio_url = dash['audio'][0]['backupUrl'][0]
            video_url = dash['video'][0]['backupUrl'][0]
            return title, audio_url, video_url

        def _download_headers(self):
            """Return the headers sent when requesting the media stream URLs."""
            return {
                'User-Agent': self.user_agent,
                'Referer': self.referer,
                'Origin': 'https://www.bilibili.com',
                'Accept': self.accept,
                'Accept-Encoding': self.accept_Encoding,
                'Accept-Language': self.accept_Language
            }

        def _download(self, url, path, headers):
            """Stream ``url`` to the file ``path`` without buffering it all in memory.

            Raises:
                requests.HTTPError: If the server answers with an error status,
                    so that an error body is never saved as a media file.
            """
            with requests.get(url, headers=headers, stream=True,
                              timeout=self.REQUEST_TIMEOUT) as response:
                response.raise_for_status()
                with open(path, mode='wb') as f:
                    for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                        f.write(chunk)

        def video(self, html):
            """Download the audio and video streams named in ``html`` and merge them.

            Writes ``<title>.mp3`` and ``<title>.mp4`` to the current directory,
            then hands over to :meth:`video_audio_merge_single`.
            """
            # Video title, audio URL, video URL
            title, audio_url, video_url = self.xin_xi(html)
            download_headers = self._download_headers()
            print('正在保存:', title)
            self._download(audio_url, title + '.mp3', download_headers)
            self._download(video_url, title + '.mp4', download_headers)
            self.video_audio_merge_single(title)

        def run(self):
            """Fetch the page, download both streams and merge them."""
            html = self.html()
            self.video(html)
            # Two files are downloaded (one audio, one video); they are merged
            # with ffmpeg, which must be installed and on PATH beforehand.
            print('爬取成功')

        def video_audio_merge_single(self, video_name):
            """Mux ``<video_name>.mp4`` and ``<video_name>.mp3`` into ``<video_name>(合).mp4``.

            The two input files are deleted only after ffmpeg has exited
            successfully.

            Raises:
                subprocess.CalledProcessError: If ffmpeg exits with a non-zero
                    status; the input files are left in place in that case.
                FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
            """
            print("视频合成开始:", video_name)
            video_path = f'{video_name}.mp4'
            audio_path = f'{video_name}.mp3'
            # An argument list (no shell) keeps titles containing spaces,
            # parentheses or other shell metacharacters from breaking or
            # hijacking the command line.
            command = [
                'ffmpeg',
                '-i', video_path,
                '-i', audio_path,
                '-vcodec', 'copy',
                '-acodec', 'copy',
                f'{video_name}(合).mp4',
            ]
            # Block until ffmpeg has finished instead of sleeping a fixed time.
            subprocess.run(command, check=True)
            print("视频合成结束:", video_name)
            os.remove(audio_path)
            os.remove(video_path)
    
    if __name__ == '__main__':
        # Address of the video page to download, e.g.
        # 'https://www.bilibili.com/video/BV1yy4y1i766'
        url = 'https://www.bilibili.com/video/BV1yy4y1i766'
        referer = 'https://space.bilibili.com/'
        # Cookie header value copied from a logged-in browser session. It is
        # read from the environment so the credential is not stored in source.
        cookie = os.environ.get('BILIBILI_COOKIE', '')
        blbl = BLBL(url, cookie, referer)
        blbl.run()
  • 相关阅读:
    Extjs combobox
    Extjs中全键盘操作,回车跳到下一单元格
    MVC调试时遇到的URL问题
    不用插件 让Firefox 支持网页翻译
    aspNet各种模块介绍
    IntelliJ IDEA 激活
    The method getTextContent() is undefined for the type Node 错误解决
    svn服务器地址变更,客户端更改服务器地址方法
    IntelliJ IDEA中TortoiseSVN修改服务器地址的方法
    修改MyEclipse中的SVN地址
  • 原文地址:https://www.cnblogs.com/shiguanggege/p/14072636.html
Copyright © 2011-2022 走看看