zoukankan      html  css  js  c++  java
  • 爬虫并发

    #!/usr/bin/python3
    # _*_ coding:utf-8 _*_
    '''
    单线程
    '''
    import functools
    import os
    import time
    import uuid

    import requests
    from bs4 import BeautifulSoup
    def out_wrapper(func):  # simple decorator that reports execution time
        """Decorator: print the wall-clock time *func* took, then return its result.

        Fixes over the original: forwards *args/**kwargs, propagates the wrapped
        function's return value, and preserves its metadata via functools.wraps.
        Existing zero-argument callers are unaffected.
        """
        @functools.wraps(func)
        def inner_wrapper(*args, **kwargs):
            start_time = time.time()
            result = func(*args, **kwargs)
            stop_time = time.time()
            print('Used time {}'.format(stop_time - start_time))
            return result
        return inner_wrapper
    def save_flag(img, filename):    # save one image to disk
        """Write raw image bytes to down_photos/<filename>.

        img: bytes — raw image content.
        filename: basename to store the image under.
        """
        # Original crashed with FileNotFoundError when the directory was missing.
        os.makedirs('down_photos', exist_ok=True)
        path = os.path.join('down_photos', filename)
        with open(path, 'wb') as fp:
            fp.write(img)
    def download_one(url):  # fetch a single image
        """Download one image from *url* and store it under a random UUID name."""
        save_flag(requests.get(url).content, str(uuid.uuid4()))
    def user_conf():    # return up to 30 image URLs
        """Scrape unsplash.com and return up to 30 <img> src URLs ending in '80'.

        Fixes over the original: <img> tags without a src attribute returned
        None and crashed on .endswith; the loop also kept scanning every tag
        after 30 URLs were already collected.
        """
        url = 'https://unsplash.com/'
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text, "html.parser")
        zzr = soup.find_all('img')
        ret = []
        for item in zzr:
            src = item.get("src")
            if src and src.endswith('80'):  # guard against missing src
                ret.append(src)
                if len(ret) >= 30:
                    break  # stop scanning once we have enough
        return ret
    
    @out_wrapper
    def download_many():
        """Download every image sequentially (single-threaded baseline)."""
        for image_url in user_conf():
            download_one(image_url)
    if __name__ == '__main__':
        download_many()
    单线程
    并发:1多进程 multiprocessing
    futures.ProcessPoolExecutor
    2多线程 threading
    futures.ThreadPoolExecutor map
    submit和futures.as_completed
    3协程 gevent
    asyncio
    from multiprocessing import Process
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        """Download each image in its own process."""
        workers = []
        for image_url in user_conf():
            proc = Process(target=download_one, args=(image_url,))
            proc.start()
            workers.append(proc)
        print(workers)
        # Wait for every worker so the timing decorator measures real completion.
        for proc in workers:
            proc.join()
    if __name__ == '__main__':
        download_many()
    多进程
    from concurrent import futures
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        """Fan downloads out across a process pool, one worker per URL.

        Returns the number of completed downloads.
        """
        urls = user_conf()
        with futures.ProcessPoolExecutor(len(urls)) as pool:
            results = pool.map(download_one, urls)
        return len(list(results))

    if __name__ == '__main__':
        download_many()
    二多进程
    import threading
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        """Download each image on its own thread, joining all before returning."""
        threads = []
        for image_url in user_conf():
            worker = threading.Thread(target=download_one, args=(image_url,))
            worker.start()
            threads.append(worker)
        # Join every thread so the timing decorator measures real completion.
        for worker in threads:
            worker.join()


    if __name__ == '__main__':
        download_many()
    一多线程
    from gevent import monkey
    monkey.patch_all()
    import gevent
    from get_photos import out_wrapper,download_one,user_conf
    
    @out_wrapper
    def download_many():
        """Spawn one gevent greenlet per image and wait for all of them."""
        greenlets = []
        for image_url in user_conf():
            greenlets.append(gevent.spawn(download_one, image_url))
        gevent.joinall(greenlets)


    if __name__ == '__main__':
        download_many()
    协程
    import uuid
    import asyncio
    
    import aiohttp
    from get_photos import out_wrapper,user_conf,save_flag
    async def download_one(url):
        """Asynchronously fetch one image and save it under a random UUID name."""
        async with aiohttp.ClientSession() as http_session:
            async with http_session.get(url) as response:
                body = await response.read()
                save_flag(body, str(uuid.uuid4()))
    @out_wrapper
    def download_many():
        """Drive all image downloads on a single asyncio event loop.

        Returns the number of completed downloads.

        Fix: the original passed bare coroutines to asyncio.wait(), which is
        deprecated since Python 3.8 and removed in 3.11; it also counted the
        returned futures set rather than results. asyncio.gather accepts
        coroutines directly and returns a plain list of results.
        """
        urls = user_conf()
        loop = asyncio.get_event_loop()
        to_do = asyncio.gather(*(download_one(url) for url in urls))
        res = loop.run_until_complete(to_do)
        loop.close()
        return len(res)


    if __name__ == '__main__':
        download_many()
    asyncio
    from concurrent import futures
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        """Map downloads over a thread pool sized to the URL count.

        Returns the number of completed downloads.
        """
        urls = user_conf()
        with futures.ThreadPoolExecutor(len(urls)) as pool:
            done = pool.map(download_one, urls)
        return len(list(done))
    多线程map
    from concurrent import futures
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        """Submit one download task per URL; collect results as they complete.

        Returns the list of per-task results (None for each successful download).
        """
        urls = user_conf()
        with futures.ThreadPoolExecutor(len(urls)) as pool:
            pending = [pool.submit(download_one, image_url) for image_url in urls]
            results = []
            for done_future in futures.as_completed(pending):
                results.append(done_future.result())
        return results


    if __name__ == '__main__':
        download_many()
    多线程submit和futures.as_completed
  • 相关阅读:
    几种二叉可并堆(详细)
    几种二叉可并堆(详细)
    Winsock编程基础介绍 . 分类: VC++ 2013-09-14 17:30 512人阅读 评论(0) 收藏
    VS2005+WINDDK+Driver Studio 3.2个人总结 分类: VC++ 2013-09-14 17:26 593人阅读 评论(0) 收藏
    用DDK开发的9054驱动 . 分类: windows驱动程序WDM 2013-09-14 17:24 625人阅读 评论(0) 收藏
    arm-linux-gcc下载与安装 分类: arm-linux-Ubuntu 2013-09-11 14:12 698人阅读 评论(0) 收藏
    u盘安装ubuntu10.04 server.txt 分类: arm-linux-Ubuntu 2013-09-11 14:10 882人阅读 评论(1) 收藏
    Makefile的规则 分类: arm-linux-Ubuntu 2013-09-11 14:09 517人阅读 评论(0) 收藏
    学习了LINUX下用C语言遍历文件夹,一些心得 分类: arm-linux-Ubuntu 2013-09-11 14:09 885人阅读 评论(1) 收藏
    dm642的中断定时器 分类: DSP 2013-09-10 14:35 660人阅读 评论(0) 收藏
  • 原文地址:https://www.cnblogs.com/zhanglin123/p/9274757.html
Copyright © 2011-2022 走看看