zoukankan      html  css  js  c++  java
  • Python3 多线程爬取梨视频

    多线程爬取梨视频

    from threading import Thread
    import requests
    import re
    
    
    def access_page(url, timeout=30):
        """Send a GET request to ``url`` and return the response object.

        Args:
            url: Address to request.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                argument. Defaults to 30.

        Returns:
            The response object produced by ``requests.get``; callers read
            ``.text`` or ``.content`` from it.

        Raises:
            requests.exceptions.Timeout: If the server does not answer within
                ``timeout`` seconds.
        """
        # requests applies no timeout unless one is given, so a stalled
        # connection would otherwise block the calling thread indefinitely.
        return requests.get(url, timeout=timeout)
    
    
    def get_video_id(homepage_data):
        """Collect the video ids found in the homepage HTML.

        An id is the text that follows ``video_`` inside an anchor's ``href``
        attribute; the ids are used to build the detail-page links.

        Args:
            homepage_data: Homepage HTML as a string.

        Returns:
            A list of id strings in the order they appear (empty when no
            anchor matches).
        """
        link_pattern = re.compile('<a href="video_(.*?)" .*?>', re.S)
        return [hit.group(1) for hit in link_pattern.finditer(homepage_data)]
    
    
    def get_video_url(detail_page_data):
        """Return the video address embedded in a detail page.

        Args:
            detail_page_data: Detail-page HTML as a string.

        Returns:
            The value of the first ``srcUrl="..."`` occurrence (a single
            string, not a list).

        Raises:
            IndexError: If the page contains no ``srcUrl="..."`` entry.
        """
        src_pattern = re.compile('srcUrl="(.*?)"', re.S)
        candidates = src_pattern.findall(detail_page_data)
        # Only the first occurrence is used.
        return candidates[0]
    
    
    def get_video_name(detail_page_date):
        """Extract the video title from a detail page as a safe file name.

        Args:
            detail_page_date: Detail-page HTML as a string.

        Returns:
            The text of the first ``<h1 class="video-tt">`` element with each
            of the characters ``/ \\ : * ? " < > |`` replaced by ``_``.

        Raises:
            IndexError: If the page contains no ``<h1 class="video-tt">``
                element.
        """
        video_name = re.findall('<h1 class="video-tt">(.*?)</h1>', detail_page_date, re.S)[0]
        # Replace characters that are illegal in file names, otherwise saving
        # fails. The backslash has to be written as `\\` inside the class: a
        # lone `\:` is only an escaped colon and leaves backslashes untouched.
        illegal_chars = r'[/\\:*?"<>|]'
        return re.sub(illegal_chars, '_', video_name)
    
    
    def save(video_data, name):
        """Write the video bytes to ``<name>.mp4`` and print a success message.

        Args:
            video_data: Bytes to write to the file.
            name: File name without the ``.mp4`` extension.
        """
        target = f'{name}.mp4'
        with open(target, mode='wb') as out_file:
            out_file.write(video_data)
            print(f'视频[{name}]下载成功!')
    
    
    def run(id):
        """Download and save the video identified by ``id``.

        Fetches the detail page, extracts the title and the video address from
        it, downloads the video bytes and stores them under the title.

        Args:
            id: Video id string appended to the detail-page URL prefix.
        """
        # Detail page for this id.
        page_html = access_page('https://www.pearvideo.com/video_' + id).text
        # Title first, then the address, both taken from the same page.
        title = get_video_name(page_html)
        media_url = get_video_url(page_html)
        # Fetch the raw video bytes and write them to disk.
        save(access_page(media_url).content, title)
    
    
    if __name__ == '__main__':
        # Fetch the homepage and collect the ids of the videos it links to.
        front_page = access_page('https://www.pearvideo.com/')
        video_ids = get_video_id(front_page.text)

        # Crawl with one thread per video; each starts as soon as it is created.
        for video_id in video_ids:
            Thread(target=run, args=(video_id,)).start()
    
    
  • 相关阅读:
    Split Temporary Variable
    Introduce Explaining Variable
    Replace Temp with Query
    Extract Method
    自测代码的价值
    代码的坏味道
    Divergent Change & Shotgun Surgery
    Large Class
    Long Method
    Duplicated Code
  • 原文地址:https://www.cnblogs.com/bigb/p/11735900.html
Copyright © 2011-2022 走看看