zoukankan      html  css  js  c++  java
  • 爬虫高并发之异步IO

    1.asyncio模块

    import asyncio

    async def task(delay=5):
        """Toy coroutine: sleep *delay* seconds without blocking the event loop.

        asyncio.sleep only supports TCP-level waiting, not HTTP requests —
        but HTTP is layered on TCP, so an HTTP request can be sent over a
        raw TCP connection (see the next example).

        *delay* defaults to 5 to match the original demo; pass a smaller
        value for quick runs.
        """
        print('start...')
        await asyncio.sleep(delay)  # yields control; other tasks run meanwhile
        print('end')

    if __name__ == '__main__':
        # Three coroutines sleep concurrently: total wall time ~5s, not 15s.
        tasks = [task(), task(), task()]
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    基本用法
    import asyncio

    async def task(host, url='/'):
        """Send a minimal HTTP/1.0 GET for *url* to *host* over raw TCP.

        *host* must be a bare hostname (e.g. 'www.baidu.com'), not a full
        URL: asyncio.open_connection resolves it via DNS and connects to
        port 80.
        """
        reader, writer = await asyncio.open_connection(host, 80)

        # HTTP header lines are CRLF-terminated; a blank line ends the headers.
        request_header_content = 'GET %s HTTP/1.0\r\nHost: %s\r\n\r\n' % (url, host)
        writer.write(request_header_content.encode('utf-8'))
        await writer.drain()        # wait until the write buffer is flushed
        text = await reader.read()  # HTTP/1.0: server closes when done, read to EOF
        writer.close()
        print(host, url, text)

    if __name__ == '__main__':
        # Bare hostnames, not 'http://...' URLs — the scheme is not resolvable.
        tasks = [task('www.baidu.com'), task('www.cnblogs.com')]

        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    基于TCP发送HTTP请求
    import asyncio

    async def task(fun, *args):
        """Run the blocking callable *fun(*args)* in the default executor.

        Bridges a blocking HTTP library (e.g. requests) into asyncio:
        run_in_executor hands the call to a worker thread and returns an
        awaitable future, so the event loop stays free while it runs.
        Prints the response's url and content.
        """
        print(fun, args)
        loop = asyncio.get_running_loop()
        future = loop.run_in_executor(None, fun, *args)  # None -> default thread pool
        response = await future
        print(response.url, response.content)


    if __name__ == '__main__':
        import requests  # blocking client, off-loaded to threads by task()

        tasks = [
            task(requests.get, 'http://bing.com'),
            task(requests.get, 'http://cnblogs.com'),
        ]
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    asyncio+requests
    # pip install aiohttp
    # aiohttp + asyncio
    import asyncio
    import aiohttp

    async def task(url):
        """Fetch *url* with aiohttp (an HTTP client built on asyncio) and print it.

        aiohttp 3.x requires the request to be used as an async context
        manager so the underlying connection is released deterministically.
        """
        async with aiohttp.request("GET", url) as response:
            print(response)

    if __name__ == '__main__':
        tasks = [task('http://bing.com'), task('http://cnblogs.com')]

        loop = asyncio.get_event_loop()
        result = loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    asyncio+aiohttp

    2.tornado模块

    from tornado.httpclient import AsyncHTTPClient
    from tornado.httpclient import HTTPRequest
    from tornado import ioloop

    COUNT = None  # total number of requests issued (set by fun)
    count = 0     # responses handled so far


    def handle_response(response):
        """Per-response callback: print the result, stop the loop when all are done."""
        global count, COUNT
        count += 1
        if response.error:
            print('error')
        else:
            print(response.body)
        # Once every outstanding request has answered, break the IOLoop.
        if count == COUNT:
            ioloop.IOLoop.instance().stop()


    def fun():
        """Kick off one asynchronous fetch per URL; completion is tracked by the callback."""
        global COUNT
        url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
        COUNT = len(url_list)
        for url in url_list:
            AsyncHTTPClient().fetch(HTTPRequest(url), handle_response)


    ioloop.IOLoop.current().add_callback(fun)
    ioloop.IOLoop.current().start()  # runs forever until stop() is called
    tornado异步IO

    3.Twisted模块

    # Asynchronous crawling with Twisted for higher concurrency.
    from twisted.web.client import getPage
    from twisted.internet import reactor, defer


    def one_done(args):
        """Fires for each page as soon as its body arrives."""
        print(args)
        print(type(args))


    def all_done(args):
        """Fires once every deferred has resolved; shuts the reactor down."""
        print(args)
        print(type(args))
        reactor.stop()


    @defer.inlineCallbacks
    def tasks(url):
        """Request *url* and attach the per-page callback."""
        deferred = getPage(bytes(url, 'utf-8'))
        deferred.addCallback(one_done)
        yield deferred

    url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
    def_list = [tasks(address) for address in url_list]

    d = defer.DeferredList(def_list)
    d.addBoth(all_done)  # called whether the list succeeds or fails
    reactor.run()  # runs forever until reactor.stop()
    twisted异步IO

    4.gevent模块

    # pip install greenlet   (coroutine primitive)
    # pip install gevent     (builds on greenlet, adds async IO)

    import gevent
    import requests
    from gevent.pool import Pool  # greenlet pool
    from gevent import monkey

    monkey.patch_all()  # patch blocking stdlib calls so requests runs async

    pool = Pool(3)  # cap concurrent greenlets at 3

    def task(method, url, req_kwargs):
        """Issue a single HTTP request and print the response."""
        print(method, url, req_kwargs)
        resp = requests.request(method, url, **req_kwargs)
        print(resp.url)
        print(resp.content)


    # pool.spawn (rather than gevent.spawn) enforces the Pool(3) limit.
    gevent.joinall([
        pool.spawn(task, method="GET", url="http://cnblogs.com", req_kwargs={}),
        pool.spawn(task, method="GET", url='http://bing.com', req_kwargs={}),
    ])
    gevent+requests
    # gevent + requests, packaged together: grequests
    import grequests

    # Build the (unsent) request objects first...
    requests_list = [
        grequests.get(target)
        for target in ('http://cnblogs.com', 'http://bing.com', 'http://che.com')
    ]

    # ...then send them all concurrently and collect the responses in order.
    response_list = grequests.map(requests_list)
    print(response_list)
    grequests

    总结:

    自定义异步爬虫IO时:

      #gevent->Twisted->Tornado->asyncio

     

  • 相关阅读:
    计算器代码
    acm数论之旅(转载)---最大公约数与最小公倍数
    acm数论之旅(转载) -- 快速幂
    acm数论之旅(转载)--素数
    位运算符(转载)
    最短路问题
    并查集
    深度优先探索与广度优先探索
    ACM注意事项
    LTE Module User Documentation(翻译6)——物理误差模型、MIMO模型、天线模型
  • 原文地址:https://www.cnblogs.com/lujiacheng-Python/p/10255902.html
Copyright © 2011-2022 走看看