zoukankan      html  css  js  c++  java
  • 异步多线程下载网页爬取的视频

    程序改进如下:

    import requests
    import re # 正则模块
    import uuid # uuid.uuid4() returns a random UUID (not timestamp-based); collisions are practically impossible

    # 导入线程池模块

    from concurrent.futures import ThreadPoolExecutor

    # Threads are an OS resource: one process can start many of them, but bare
    # threads give no convenient way to cap how many run at once.
    # A thread pool keeps the number of worker threads under control.

    pool = ThreadPoolExecutor(max_workers=50)


    # 爬虫三部曲

    # 1、发送请求

    def get_page(url, timeout=30):
        """Send an HTTP GET request for ``url`` and return the response.

        Args:
            url: address of the page or media file to download.
            timeout: seconds handed to ``requests.get`` so a stalled server
                cannot block a worker thread forever.

        Returns:
            The response object produced by ``requests.get``.
        """
        # The f prefix is required; without it the literal text "{url}" is printed.
        print(f"开始异步任务:{url}")
        response = requests.get(url, timeout=timeout)
        return response


    # 2、解析数据
    # 解析主页获取视频详情页ID
    def parse_index(res):
        """Done-callback for the home-page request: queue every video detail page.

        Args:
            res: completed future whose result is the home-page response.
        """
        response = res.result()
        # The closing quote anchors the lazy group. Without it ``(.*?)`` always
        # matches the empty string and every detail URL collapses to ".../video_".
        video_ids = re.findall(r'<a href="video_(.*?)"', response.text, re.S)
        for m_id in video_ids:
            # Build the detail-page URL from the captured id.
            detail_url = 'https://www.pearvideo.com/video_' + m_id
            # Fetch the detail page in the pool and pass the result to parse_detail.
            pool.submit(get_page, detail_url).add_done_callback(parse_detail)


    ##解析详情页获取视频url
    def parse_detail(res):
        """Done-callback for a detail-page request: queue the video download.

        Args:
            res: completed future whose result is the detail-page response.
        """
        response = res.result()
        # No space between "srcUrl" and "=", and the capture stops at the first
        # closing quote so it cannot run past the end of the URL.
        match = re.search(r'srcUrl="(.*?)"', response.text, re.S)
        if match is None:
            # Indexing an empty findall() result would raise IndexError inside
            # the callback; skip pages that expose no video URL instead.
            print('srcUrl not found in detail page; skipping')
            return
        movie_url = match.group(1)
        # Download the video itself in the pool and pass the result to save_movie.
        pool.submit(get_page, movie_url).add_done_callback(save_movie)


    # 3、保存数据

    def save_movie(res):
        """Done-callback for a video download: write the bytes to a local .mp4 file.

        The file is created in the current working directory and named after a
        freshly generated UUID4.

        Args:
            res: completed future whose result is the video response.
        """
        movie_res = res.result()
        # Leaving the ``with`` block flushes and closes the file, so no explicit
        # flush() call is needed.
        with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
            f.write(movie_res.content)

    # 测试:调用函数并实现爬取

    if __name__ == '__main__':  # script entry point
        # Submit the home-page request asynchronously; parse_index receives the
        # completed future as a callback.
        # NOTE(review): the main thread returns right after this submit. On
        # recent CPython versions, pool.submit() calls made from callbacks during
        # interpreter shutdown may raise RuntimeError - confirm, and consider
        # waiting for the crawl to finish explicitly.
        url = 'https://www.pearvideo.com/'
        pool.submit(get_page, url).add_done_callback(parse_index)

     执行代码如下:

    import requests
    import re  # 正则模块
    import uuid  # uuid.uuid4() returns a random UUID (not timestamp-based); collisions are practically impossible
    
    # 导入线程池模块
    
    from concurrent.futures import ThreadPoolExecutor
    
    # Threads are an OS resource: one process can start many of them, but bare
    # threads give no convenient way to cap how many run at once.
    # A thread pool keeps the number of worker threads under control.
    
    pool = ThreadPoolExecutor(max_workers=50)
    
    
    # 爬虫三部曲
    
    # 1、发送请求
    
    def get_page(url, timeout=30):
        """Send an HTTP GET request for ``url`` and return the response.

        Args:
            url: address of the page or media file to download.
            timeout: seconds handed to ``requests.get`` so a stalled server
                cannot block a worker thread forever.

        Returns:
            The response object produced by ``requests.get``.
        """
        # The f prefix is required; without it the literal text "{url}" is printed.
        print(f"开始异步任务:{url}")
        response = requests.get(url, timeout=timeout)
        return response
    
    
    # 2、解析数据
    # 解析主页获取视频详情页ID
    def parse_index(res):
        """Done-callback for the home-page request: queue every video detail page.

        Args:
            res: completed future whose result is the home-page response.
        """
        home_page = res.result()
        video_ids = re.findall('<a href="video_(.*?)"', home_page.text, re.S)
        print(video_ids)
        detail_prefix = 'https://www.pearvideo.com/video_'
        for video_id in video_ids:
            # Fetch each detail page in the pool; parse_detail handles the result.
            detail_future = pool.submit(get_page, detail_prefix + video_id)
            detail_future.add_done_callback(parse_detail)
    #
    #
    ##解析详情页获取视频url
    def parse_detail(res):
        """Done-callback for a detail-page request: queue the video download.

        Args:
            res: completed future whose result is the detail-page response.
        """
        response = res.result()
        # Stop the capture at the first closing quote; requiring a trailing space
        # lets the lazy group run past the URL when the quote is followed by
        # anything else.
        match = re.search(r'srcUrl="(.*?)"', response.text, re.S)
        if match is None:
            # Indexing an empty findall() result would raise IndexError inside
            # the callback; skip pages that expose no video URL instead.
            print('srcUrl not found in detail page; skipping')
            return
        movie_url = match.group(1)
        print(movie_url)
        # Download the video itself in the pool and pass the result to save_movie.
        pool.submit(get_page, movie_url).add_done_callback(save_movie)
    
    
    # 3、保存数据
    
    def save_movie(res):
        """Done-callback for a video download: write the bytes to a local .mp4 file.

        The file is created in the current working directory and named after a
        freshly generated UUID4.

        Args:
            res: completed future whose result is the video response.
        """
        download = res.result()
        file_name = f'{uuid.uuid4()}.mp4'
        # Write the video bytes to local disk.
        with open(file_name, 'wb') as out:
            out.write(download.content)
            out.flush()
    
    # 测试:调用函数并实现爬取
    
    if __name__ == '__main__':  # script entry point
        # Kick off the crawl: fetch the home page asynchronously and let
        # parse_index receive the completed future.
        url = 'https://www.pearvideo.com/'
        index_future = pool.submit(get_page, url)
        index_future.add_done_callback(parse_index)
    

      

    数据传递过程,如下几图:

  • 相关阅读:
    评估算法优劣的核心指标是什么?
    5.垃圾回收器
    k8s-yaml详解
    curl 忽略https的ssl的证书验证
    C++ #include " " 与 <>有什么区别?
    JavaHomeWorkList-Java语言程序设计(基础篇)第十版第三章部分答案
    Java初体验
    mysql 分组取第N条记录
    spring security认证失败处理
    spring security session存储
  • 原文地址:https://www.cnblogs.com/evan0925/p/11022149.html
Copyright © 2011-2022 走看看