  • Concurrent Programming: Thread Pool Demo

    Thread pool demo

    from concurrent.futures import ThreadPoolExecutor
    import time
    
    # the pool creates at most 100 worker threads
    pool = ThreadPoolExecutor(100)
    
    
    def task(line):
        print(line)
        time.sleep(10)
    
    
    if __name__ == '__main__':
        for line in range(1000):
            pool.submit(task, line)
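
    The pool caps concurrency at 100 worker threads, so the 1000 submitted tasks are queued and run at most 100 at a time. Below is a minimal sketch, assuming a hypothetical task that returns a value instead of sleeping, of how to block until everything finishes and read results back from the returned Future objects:

    from concurrent.futures import ThreadPoolExecutor

    pool = ThreadPoolExecutor(100)


    def task(line):
        # hypothetical variant of the demo task: return a value instead of sleeping
        return line * 2


    if __name__ == '__main__':
        # submit() returns a Future; keep them so the results can be read later
        futures = [pool.submit(task, line) for line in range(1000)]

        # shutdown(wait=True) blocks until every submitted task has finished
        pool.shutdown(wait=True)

        # Future.result() gives each task's return value
        print(sum(f.result() for f in futures))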
    
    
    
    Crawling a site's short videos with a synchronous crawler (kept commented out; the concurrent version follows)
    import requests
    import re
    # import os
    # import uuid
    #
    #
    # # 1. Send the request and get the response
    # def get_page(url):
    #     response = requests.get(url)
    #     if response.status_code == 200:
    #         return response
    #
    #
    # # 2. Parse the home page and extract the video ids
    # def parse_page(response):
    #     '''
    #     https://www.pearvideo.com/video_1630253
    #     https://www.pearvideo.com/video_1630042
    #     '''
    #     # match every video's detail-page id and collect them into a list
    #     id_list = re.findall('href="video_(.*?)"', response.text, re.S)
    #     # print(len(id_list))
    #     id_list = list(set(id_list))
    #     # print(len(id_list))
    #     return id_list
    #
    #
    # def parse_detail(response):
    #     '''
    #     srcUrl="https://video.pearvideo.com/mp4/adshort/20191206/cont-1630253-14671892_adpkg-ad_hd.mp4"
    #     srcUrl="(.*?)"
    #     '''
    #     mp4_url = re.findall('srcUrl="(.*?)"', response.text, re.S)
    #     # print(mp4_url, 111111)
    #     if mp4_url:
    #         return mp4_url[0]
    #
    #
    # # 3. Save the data
    # def save_movie(movie_url):
    #     response = get_page(movie_url)
    #
    #     movie_dir = r'D:\项目路径\python13期\day30\梨视频'
    #     movie_path = os.path.join(
    #         movie_dir, str(uuid.uuid4()) + '.mp4'
    #     )
    #     # print(movie_path)
    #     with open(movie_path, 'wb') as f:
    #         for line in response.iter_content():
    #             f.write(line)
    #
    #
    # if __name__ == '__main__':
    #     response = get_page('https://www.pearvideo.com/')
    #
    #     # parse out every detail-page id
    #     id_list = parse_page(response)
    #     # print(id_list)
    #
    #     # build each detail-page URL
    #     for id_num in id_list:
    #         url = f'https://www.pearvideo.com/video_{id_num}'
    #         # print(url)
    #
    #         # send a request to the detail page
    #         detail_response = get_page(url)
    #         # print(detail_response.text)
    #
    #         # # parse the detail page and extract the video's real address
    #         mp4_url = parse_detail(detail_response)
    #         print(mp4_url)
    #
    #         # # request the actual video data
    #         # movie_response = get_page(mp4_url)
    #
    #         # response.content
    #         save_movie(mp4_url)
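
    This version and the asynchronous one below lean on the same two regular expressions. A small self-contained sketch (the HTML fragments here are made up purely for illustration) of what they extract:

    import re

    # made-up fragment in the shape of the pearvideo home page
    home_html = 'href="video_1630253" ... href="video_1630042" ... href="video_1630253"'
    id_list = re.findall('href="video_(.*?)"', home_html, re.S)
    print(id_list)             # ['1630253', '1630042', '1630253'] -- duplicates included
    print(list(set(id_list)))  # deduplicated, order not guaranteed

    # made-up fragment in the shape of a detail page
    detail_html = 'srcUrl="https://video.pearvideo.com/mp4/adshort/20191206/cont-1630253-14671892_adpkg-ad_hd.mp4"'
    mp4_url = re.findall('srcUrl="(.*?)"', detail_html, re.S)
    print(mp4_url[0] if mp4_url else None)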
    
    
    
    # Asynchronous pearvideo (梨视频) crawler
    import requests
    import re
    import os
    import uuid
    
    from concurrent.futures import ThreadPoolExecutor
    pool = ThreadPoolExecutor(100)
    
    
    # 1. Send the request and get the response
    def get_page(url):
        print(f'sending GET request: {url}')
        response = requests.get(url)
        if response.status_code == 200:
            return response
    
    
    # 2. Parse the home page and extract the video ids
    def parse_page(response):
        '''
        https://www.pearvideo.com/video_1630253
        https://www.pearvideo.com/video_1630042
        '''
        # match every video's detail-page id and collect them into a list
        id_list = re.findall('href="video_(.*?)"', response.text, re.S)
        # print(len(id_list))
        id_list = list(set(id_list))
        # print(len(id_list))
        return id_list
    
    
    # parse the detail page and extract the video URL
    def parse_detail(res):
        '''
        srcUrl="https://video.pearvideo.com/mp4/adshort/20191206/cont-1630253-14671892_adpkg-ad_hd.mp4"
        srcUrl="(.*?)"
        '''
        # res is a Future; result() returns whatever get_page returned
        res2 = res.result()
        if res2 is None:  # get_page returns None on a non-200 response
            return
        print(res2)

        movie_url = re.findall('srcUrl="(.*?)"', res2.text, re.S)
        print(movie_url)
        if movie_url:
            movie_url = movie_url[0]
            # hand the download itself off to the pool as well
            pool.submit(save_movie, movie_url)
    
    
    # 3. Save the data
    def save_movie(movie_url):
    
        # time.sleep(1)
        # fetching the response body is an IO-bound operation;
        # stream=True avoids pulling the whole video into memory at once
        response = requests.get(movie_url, stream=True)

        movie_dir = r'D:\项目路径\python13期\day30\梨视频'
        movie_path = os.path.join(
            movie_dir, str(uuid.uuid4()) + '.mp4'
        )
        # print(movie_path)
        with open(movie_path, 'wb') as f:
            # write in 1 KB chunks rather than the 1-byte default
            for line in response.iter_content(chunk_size=1024):
                f.write(line)
    
    
    if __name__ == '__main__':
        response = get_page('https://www.pearvideo.com/')
        id_list = parse_page(response)
        for id_num in id_list:
            # each video's detail page
            url = f'https://www.pearvideo.com/video_{id_num}'
    
            # submit the detail-page request as an asynchronous task
            # add_done_callback(parse_detail): when get_page finishes, its result is handed to parse_detail
            # parse_detail receives a Future object; its result() is get_page's return value
            pool.submit(get_page, url).add_done_callback(parse_detail)
    
        import datetime
    
        print(datetime.datetime.now())
        # 21:54 ---> 18:45
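
    One caveat: the timestamp above is printed as soon as the loop has finished submitting tasks, not when the downloads are done. Below is a standalone sketch of how add_done_callback behaves and how to wait for completion with shutdown(wait=True); it is not a drop-in change for the crawler, whose callback itself submits new tasks and would raise if it ran after shutdown:

    from concurrent.futures import ThreadPoolExecutor

    pool = ThreadPoolExecutor(4)


    def work(n):
        return n * n


    def on_done(future):
        # the callback receives the Future; result() is work()'s return value
        print('callback got:', future.result())


    if __name__ == '__main__':
        for n in range(5):
            pool.submit(work, n).add_done_callback(on_done)

        # block until every submitted task (and its callback) has run
        pool.shutdown(wait=True)
        print('all tasks finished')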
    