zoukankan      html  css  js  c++  java
  • 怎么用Python爬取抖音小视频? 资深程序员都这样爬取的(附源码)

    简介

    抖音,是一款可以拍短视频的音乐创意短视频社交软件,该软件于2016年9月上线,是一个专注年轻人的15秒音乐短视频社区。用户可以通过这款软件选择歌曲,拍摄15秒的音乐短视频,形成自己的作品。此APP已在Android各大应用商店和App Store上线。

    今天咱们就用Python爬取抖音视频

    准备:

    环境:Python3.6+Windows

    IDE:你开心就好,喜欢用哪个就用哪个

    模块:

    1 from splinter.driver.webdriver.chrome import Options, Chrome
    2 from splinter.browser import Browser
    3 from contextlib import closing
    4 import requests, json, time, re, os, sys, time
    5 from bs4 import BeautifulSoup

    获得视频播放地址

    • user_id: 查询的用户ID(参数)

    • video_names: 视频名字列表(返回值)

    • video_urls: 视频链接列表(返回值)

    • nickname: 用户昵称(返回值)

     def get_video_urls(self, user_id):
         """
         Look up a Douyin user and collect the share links of their videos.

         Parameters:
             user_id: the Douyin ID (compared against ``unique_id``) to search for
         Returns:
             video_names: list of output file names, one per video
             video_urls: list of share-page URLs, parallel to video_names
             nickname: nickname of the matched user
         Raises:
             ValueError: if no search result matches user_id after a few attempts
         """
         video_names = []
         video_urls = []
         search_url = 'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622' % user_id

         # Retry a bounded number of times instead of looping forever, and
         # look at every returned user rather than only the first one.
         user_info = None
         for _ in range(5):
             # NOTE(review): verify=False disables TLS certificate checks.
             req = requests.get(url=search_url, verify=False)
             candidates = json.loads(req.text).get('user_list') or []
             for candidate in candidates:
                 if candidate['user_info']['unique_id'] == user_id:
                     user_info = candidate['user_info']
                     break
             if user_info is not None:
                 break
         if user_info is None:
             raise ValueError('No Douyin user with ID %r was found' % user_id)

         user_url = 'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s' % (user_info['uid'], user_info['aweme_count'])
         req = requests.get(url=user_url, verify=False)
         html = json.loads(req.text)

         # Videos that only carry the default share text get a running number
         # as their file name; all others are named after their description.
         untitled = 0
         for each in html['aweme_list']:
             share_info = each['share_info']
             share_desc = share_info['share_desc']
             if share_desc == '抖音-原创音乐短视频社区':
                 untitled += 1
                 video_names.append(str(untitled) + '.mp4')
             else:
                 video_names.append(share_desc + '.mp4')
             video_urls.append(share_info['share_url'])

         return video_names, video_urls, user_info['nickname']

    获得带水印的视频播放地址

    • video_url:带水印的视频播放地址

    • download_url: 带水印的视频下载地址

    def get_download_url(self, video_url):
        """
        Resolve the direct (watermarked) download address of a video.

        Parameters:
            video_url: share-page URL of the video
        Returns:
            download_url: first entry of the video's play-address URL list
        Raises:
            ValueError: if the page's last <script> holds no ``var data = [...]``
        """
        req = requests.get(url=video_url, verify=False)
        bf = BeautifulSoup(req.text, 'lxml')
        script = bf.find_all('script')[-1]
        # The brackets must be escaped: unescaped, "[(.+)]" is a character
        # class and the JSON between the brackets is never captured.
        matches = re.findall(r'var data = \[(.+)\];', str(script))
        if not matches:
            raise ValueError('No video data found on page %s' % video_url)
        video_html = json.loads(matches[0])
        download_url = video_html['video']['play_addr']['url_list'][0]
        return download_url

    视频下载

    • video_url: 带水印的视频地址

    • video_name: 视频名

    • watermark_flag: 是否下载不带水印的视频

     def video_downloader(self, video_url, video_name, watermark_flag=True):
         """
         Download one video to disk while printing its size and progress.

         Parameters:
             video_url: share-page URL of the (watermarked) video
             video_name: path of the file to write
             watermark_flag: if truthy, resolve the watermark-free address via
                 remove_watermark(); otherwise use get_download_url()
         Returns:
             None
         """
         if watermark_flag:
             video_url = self.remove_watermark(video_url)
         else:
             video_url = self.get_download_url(video_url)

         with closing(requests.get(video_url, stream=True, verify=False)) as response:
             # Check the status first; a non-200 reply is skipped silently.
             if response.status_code != 200:
                 return

             chunk_size = 1024
             # The header may be missing; 0 then means "size unknown".
             content_size = int(response.headers.get('content-length', 0))
             sys.stdout.write('  [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))

             size = 0
             with open(video_name, 'wb') as file:
                 for data in response.iter_content(chunk_size=chunk_size):
                     file.write(data)
                     size += len(data)
                     if content_size:
                         # '\r' redraws the percentage on the same console line.
                         sys.stdout.write('  [下载进度]:%.2f%%' % float(size / content_size * 100) + '\r')
                         sys.stdout.flush()

    获得无水印的视频播放地址

     def remove_watermark(self, video_url):
         """
         Obtain the watermark-free download address for a video.

         Parameters:
             video_url: URL of the watermarked video
         Returns:
             the href of the first link in the result paragraph
         """
         browser = self.driver
         # Submit the video URL through the douyin.iiilab.com form.
         browser.visit('http://douyin.iiilab.com/')
         browser.find_by_tag('input').fill(video_url)
         browser.find_by_xpath('//button[@class="btn btn-default"]').click()
         # Read the first result paragraph and pull the link out of its markup.
         result_markup = browser.find_by_xpath('//div[@class="thumbnail"]/div/p')[0].html
         link = BeautifulSoup(result_markup, 'lxml').find('a')
         return link.get('href')

    下载视频

     def run(self):
         """
         Interactive entry point: ask for a user ID and download every video.

         Parameters:
             None
         Returns:
             None
         """
         self.hello()
         user_id = input('请输入ID(例如40103580):')
         video_names, video_urls, nickname = self.get_video_urls(user_id)
         # Videos are stored in a directory named after the user's nickname.
         os.makedirs(nickname, exist_ok=True)
         print('视频下载中:共有%d个作品!\n' % len(video_urls))
         for num, (raw_name, video_url) in enumerate(zip(video_names, video_urls), start=1):
             print('  解析第%d个视频链接 [%s] 中,请稍后!\n' % (num, video_url))
             video_name = self._strip_path_separators(raw_name)
             self.video_downloader(video_url, os.path.join(nickname, video_name))
             print('\n')

         print('下载完成!')

     @staticmethod
     def _strip_path_separators(file_name):
         """Return file_name with every '\\' and '/' removed, so it stays one path component."""
         return file_name.replace('\\', '').replace('/', '')

    全部代码

      1 +# -*- coding:utf-8 -*-
      2 
      3 +Python学习交流群:125240963
      4 +Python学习交流群:125240963
      5 +Python学习交流群:125240963
      6 
      7 +from splinter.driver.webdriver.chrome import Options, Chrome
      8 +from splinter.browser import Browser
      9 +from contextlib import closing
     10 +import requests, json, time, re, os, sys, time
     11 +from bs4 import BeautifulSoup
     12 +
     class DouYin(object):
         """Downloader for the videos published by a single Douyin user."""

         def __init__(self, width = 500, height = 300, executable_path='D:/chromedriver'):
             """
             Start the headless Chrome browser used to resolve watermark-free links.

             Parameters:
                 width: kept for backward compatibility; not used here
                 height: kept for backward compatibility; not used here
                 executable_path: location of the chromedriver executable
             """
             # Headless browser with a desktop Chrome user agent.
             chrome_options = Options()
             chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"')
             self.driver = Browser(driver_name='chrome', executable_path=executable_path, options=chrome_options, headless=True)

         def get_video_urls(self, user_id):
             """
             Look up a Douyin user and collect the share links of their videos.

             Parameters:
                 user_id: the Douyin ID (compared against ``unique_id``) to search for
             Returns:
                 video_names: list of output file names, one per video
                 video_urls: list of share-page URLs, parallel to video_names
                 nickname: nickname of the matched user
             Raises:
                 ValueError: if no search result matches user_id after a few attempts
             """
             video_names = []
             video_urls = []
             search_url = 'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622' % user_id

             # Retry a bounded number of times instead of looping forever, and
             # look at every returned user rather than only the first one.
             user_info = None
             for _ in range(5):
                 # NOTE(review): verify=False disables TLS certificate checks.
                 req = requests.get(url=search_url, verify=False)
                 candidates = json.loads(req.text).get('user_list') or []
                 for candidate in candidates:
                     if candidate['user_info']['unique_id'] == user_id:
                         user_info = candidate['user_info']
                         break
                 if user_info is not None:
                     break
             if user_info is None:
                 raise ValueError('No Douyin user with ID %r was found' % user_id)

             user_url = 'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s' % (user_info['uid'], user_info['aweme_count'])
             req = requests.get(url=user_url, verify=False)
             html = json.loads(req.text)

             # Videos that only carry the default share text get a running number
             # as their file name; all others are named after their description.
             untitled = 0
             for each in html['aweme_list']:
                 share_info = each['share_info']
                 share_desc = share_info['share_desc']
                 if share_desc == '抖音-原创音乐短视频社区':
                     untitled += 1
                     video_names.append(str(untitled) + '.mp4')
                 else:
                     video_names.append(share_desc + '.mp4')
                 video_urls.append(share_info['share_url'])

             return video_names, video_urls, user_info['nickname']

         def get_download_url(self, video_url):
             """
             Resolve the direct (watermarked) download address of a video.

             Parameters:
                 video_url: share-page URL of the video
             Returns:
                 download_url: first entry of the video's play-address URL list
             Raises:
                 ValueError: if the page's last <script> holds no ``var data = [...]``
             """
             req = requests.get(url=video_url, verify=False)
             bf = BeautifulSoup(req.text, 'lxml')
             script = bf.find_all('script')[-1]
             # The brackets must be escaped: unescaped, "[(.+)]" is a character
             # class and the JSON between the brackets is never captured.
             matches = re.findall(r'var data = \[(.+)\];', str(script))
             if not matches:
                 raise ValueError('No video data found on page %s' % video_url)
             video_html = json.loads(matches[0])
             download_url = video_html['video']['play_addr']['url_list'][0]
             return download_url

         def video_downloader(self, video_url, video_name, watermark_flag=True):
             """
             Download one video to disk while printing its size and progress.

             Parameters:
                 video_url: share-page URL of the (watermarked) video
                 video_name: path of the file to write
                 watermark_flag: if truthy, resolve the watermark-free address via
                     remove_watermark(); otherwise use get_download_url()
             Returns:
                 None
             """
             if watermark_flag:
                 video_url = self.remove_watermark(video_url)
             else:
                 video_url = self.get_download_url(video_url)

             with closing(requests.get(video_url, stream=True, verify=False)) as response:
                 # Check the status first; a non-200 reply is skipped silently.
                 if response.status_code != 200:
                     return

                 chunk_size = 1024
                 # The header may be missing; 0 then means "size unknown".
                 content_size = int(response.headers.get('content-length', 0))
                 sys.stdout.write('  [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))

                 size = 0
                 with open(video_name, 'wb') as file:
                     for data in response.iter_content(chunk_size=chunk_size):
                         file.write(data)
                         size += len(data)
                         if content_size:
                             # '\r' redraws the percentage on the same console line.
                             sys.stdout.write('  [下载进度]:%.2f%%' % float(size / content_size * 100) + '\r')
                             sys.stdout.flush()

         def remove_watermark(self, video_url):
             """
             Obtain the watermark-free download address for a video.

             Parameters:
                 video_url: URL of the watermarked video
             Returns:
                 the href of the first link in the result paragraph
             """
             # Submit the video URL through the douyin.iiilab.com form.
             self.driver.visit('http://douyin.iiilab.com/')
             self.driver.find_by_tag('input').fill(video_url)
             self.driver.find_by_xpath('//button[@class="btn btn-default"]').click()
             html = self.driver.find_by_xpath('//div[@class="thumbnail"]/div/p')[0].html
             bf = BeautifulSoup(html, 'lxml')
             return bf.find('a').get('href')

         def run(self):
             """
             Interactive entry point: ask for a user ID and download every video.

             Parameters:
                 None
             Returns:
                 None
             """
             self.hello()
             user_id = input('请输入ID(例如40103580):')
             video_names, video_urls, nickname = self.get_video_urls(user_id)
             # Videos are stored in a directory named after the user's nickname.
             os.makedirs(nickname, exist_ok=True)
             print('视频下载中:共有%d个作品!\n' % len(video_urls))
             for num, (raw_name, video_url) in enumerate(zip(video_names, video_urls), start=1):
                 print('  解析第%d个视频链接 [%s] 中,请稍后!\n' % (num, video_url))
                 video_name = self._strip_path_separators(raw_name)
                 self.video_downloader(video_url, os.path.join(nickname, video_name))
                 print('\n')

             print('下载完成!')

         @staticmethod
         def _strip_path_separators(file_name):
             """Return file_name with every '\\' and '/' removed, so it stays one path component."""
             return file_name.replace('\\', '').replace('/', '')

         def hello(self):
             """
             Print the welcome banner.

             Parameters:
                 None
             Returns:
                 None
             """
             print('*' * 100)
             print('\t\t\t\t抖音App视频下载小助手')
             print('\t\t作者:Python学习交流群:125240963')
             print('*' * 100)
    160 +
    161 +
    if __name__ == '__main__':
        # Build the downloader (this starts the headless browser), then run
        # the interactive download flow.
        douyin = DouYin()
        douyin.run()
  • 相关阅读:
    Hadoop命令大全
    Cube中时间维度
    无法锁定管理目录(/var/lib/dpkg/),是否有其他进程正占用它?
    IE6、IE7、IE8、FF对空标签块状元素解释的不同点
    SSIS导出平面文件数据带_x003C_none_x003E的问题
    用DB2 Runtime Client实现Apache Derby 数据库ODBC编程
    区块链技术探索
    JS原型对象
    this关键字
    消息认证码
  • 原文地址:https://www.cnblogs.com/pythonfm/p/9097792.html
Copyright © 2011-2022 走看看