  • Crawler concurrency

    #!/usr/bin/python3
    # _*_ coding:utf-8 _*_
    '''
    Single-threaded version
    '''
    import os,time
    import requests
    from bs4 import BeautifulSoup
    import uuid
    def out_wrapper(func):  # simple decorator that records execution time
        def inner_wrapper():
            start_time = time.time()
            func()
            stop_time = time.time()
            print('Used time {}'.format(stop_time-start_time))
        return inner_wrapper
    def save_flag(img,filename):    # save the image bytes to disk
        os.makedirs('down_photos',exist_ok=True)    # make sure the target directory exists
        path = os.path.join('down_photos',filename)
        with open(path,'wb') as fp:
            fp.write(img)
    def download_one(url):  # download a single image
        image = requests.get(url)
        save_flag(image.content,str(uuid.uuid4()))
    def user_conf():    # return the URLs of 30 images
        url = 'https://unsplash.com/'
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text, "html.parser")
        zzr = soup.find_all('img')
        ret = []
        num = 0
        for item in zzr:
            src = item.get("src")
            if src and src.endswith('80') and num < 30:     # skip <img> tags with no src
                num += 1
                ret.append(src)
        return ret

    @out_wrapper
    def download_many():
        zzr = user_conf()
        for item in zzr:
            download_one(item)
    if __name__ == '__main__':
        download_many()
    Single-threaded
    The single-threaded script above is imported by the variants below, so it is assumed to be saved as get_photos.py.
    Concurrency options, each demonstrated below:
    1. Multiprocessing: multiprocessing.Process and futures.ProcessPoolExecutor
    2. Multithreading: threading, futures.ThreadPoolExecutor with map, and submit with futures.as_completed
    3. Coroutines: gevent and asyncio
    from multiprocessing import Process
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        zzr = user_conf()
        task_list = []
        for item in zzr:
            t = Process(target=download_one,args=(item,))
            t.start()
            task_list.append(t)
        print(task_list)
        [t.join() for t in task_list]   # wait for every process to finish (so the timer measures the full run)
    if __name__ == '__main__':
        download_many()
    Multiprocessing (Process)
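    Spawning one Process per URL starts 30 processes at once, which is heavy. As a point of comparison, here is a minimal sketch using multiprocessing.Pool to cap the worker count; the pool size of 4 is an assumption, not from the original post.
    from multiprocessing import Pool
    from get_photos import out_wrapper,download_one,user_conf

    @out_wrapper
    def download_many():
        zzr = user_conf()
        with Pool(4) as pool:   # assumed cap of 4 worker processes
            pool.map(download_one,zzr)  # blocks until every download finishes

    if __name__ == '__main__':
        download_many()
    Multiprocessing (Pool, sketch)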
    from concurrent import futures
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        zzr = user_conf()
        with futures.ProcessPoolExecutor(len(zzr)) as executor:
            res = executor.map(download_one,zzr)
        return len(list(res))
    
    if __name__ == '__main__':
        download_many()
    Multiprocessing (futures.ProcessPoolExecutor with map)
    import threading
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        zzr = user_conf()
        task_list = []
        for item in zzr:
            t = threading.Thread(target=download_one,args=(item,))
            t.start()
            task_list.append(t)
        [t.join() for t in task_list]
    
    
    if __name__ == '__main__':
        download_many()
    Multithreading (threading.Thread)
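    If 30 unbounded threads are too many, a threading.BoundedSemaphore can cap how many downloads run at once. A minimal sketch of that variant; the limit of 10 is an assumption, not from the original post.
    import threading
    from get_photos import out_wrapper,download_one,user_conf

    sem = threading.BoundedSemaphore(10)    # assumed limit: 10 downloads in flight

    def bounded_download(url):
        with sem:   # blocks while 10 downloads are already running
            download_one(url)

    @out_wrapper
    def download_many():
        zzr = user_conf()
        task_list = [threading.Thread(target=bounded_download,args=(item,)) for item in zzr]
        for t in task_list:
            t.start()
        for t in task_list:
            t.join()

    if __name__ == '__main__':
        download_many()
    Multithreading (bounded, sketch)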
    from gevent import monkey
    monkey.patch_all()
    import gevent
    from get_photos import out_wrapper,download_one,user_conf
    
    @out_wrapper
    def download_many():
        zzr = user_conf()
        jobs = [gevent.spawn(download_one,item) for item in zzr]
        gevent.joinall(jobs)
    
    
    if __name__ == '__main__':
        download_many()
    Coroutines (gevent)
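    gevent also ships its own pool for bounding concurrency. A minimal sketch with gevent.pool.Pool; the pool size of 10 is an assumption, not from the original post.
    from gevent import monkey
    monkey.patch_all()
    from gevent.pool import Pool
    from get_photos import out_wrapper,download_one,user_conf

    @out_wrapper
    def download_many():
        zzr = user_conf()
        pool = Pool(10)     # assumed cap: at most 10 greenlets at once
        pool.map(download_one,zzr)  # blocks until all downloads finish

    if __name__ == '__main__':
        download_many()
    Coroutines (gevent pool, sketch)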
    import uuid
    import asyncio
    
    import aiohttp
    from get_photos import out_wrapper,user_conf,save_flag
    async def download_one(url):
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                save_flag(await resp.read(),str(uuid.uuid4()))
    @out_wrapper
    def download_many():
        urls = user_conf()
        loop = asyncio.get_event_loop()
        to_do = [download_one(url) for url in urls]
        # passing bare coroutines to asyncio.wait works on the Python 3.5-3.7
        # era this post targets; newer versions deprecate it (see the sketch below)
        wait_coro = asyncio.wait(to_do)
        res, _ = loop.run_until_complete(wait_coro)
        loop.close()
        return len(res)
    
    
    if __name__ == '__main__':
        download_many()
    asyncio
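    The manual loop management above matches the asyncio API of the Python 3.5-3.7 era. A minimal sketch of the same downloads on Python 3.7+, where asyncio.run drives the event loop and a semaphore caps concurrency; the limit of 10 is an assumption, not from the original post.
    import uuid
    import asyncio

    import aiohttp
    from get_photos import out_wrapper,user_conf,save_flag

    async def download_one(session,url,sem):
        async with sem:     # at most 10 requests in flight
            async with session.get(url) as resp:
                save_flag(await resp.read(),str(uuid.uuid4()))

    async def main():
        urls = user_conf()
        sem = asyncio.Semaphore(10)     # assumed concurrency limit
        async with aiohttp.ClientSession() as session:
            await asyncio.gather(*(download_one(session,url,sem) for url in urls))

    @out_wrapper
    def download_many():
        asyncio.run(main())

    if __name__ == '__main__':
        download_many()
    asyncio (Python 3.7+, sketch)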
    from concurrent import futures
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        zzr = user_conf()
        with futures.ThreadPoolExecutor(len(zzr)) as executor:
            res = executor.map(download_one,zzr)
        return len(list(res))

    if __name__ == '__main__':
        download_many()
    Multithreading (ThreadPoolExecutor map)
    from concurrent import futures
    from get_photos import out_wrapper,download_one,user_conf
    @out_wrapper
    def download_many():
        zzr = user_conf()
        with futures.ThreadPoolExecutor(len(zzr)) as executor:
            to_do = [executor.submit(download_one,item) for item in zzr]
            ret = [future.result() for future in futures.as_completed(to_do)]
        return ret
    
    
    if __name__ == '__main__':
        download_many()
    Multithreading (submit and futures.as_completed)
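    One advantage of submit with futures.as_completed over map: futures come back in completion order, and future.result() re-raises any exception from the worker, so failed downloads can be reported one by one. A minimal sketch of that pattern; the per-URL error reporting is an addition, not from the original post.
    from concurrent import futures
    from get_photos import out_wrapper,download_one,user_conf

    @out_wrapper
    def download_many():
        zzr = user_conf()
        with futures.ThreadPoolExecutor(len(zzr)) as executor:
            to_do = {executor.submit(download_one,item): item for item in zzr}
            for future in futures.as_completed(to_do):
                try:
                    future.result()     # re-raises any exception from download_one
                except Exception as exc:
                    print('{} failed: {}'.format(to_do[future],exc))

    if __name__ == '__main__':
        download_many()
    Multithreading (error handling, sketch)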
  • Original post: https://www.cnblogs.com/zhanglin123/p/9274757.html