1. The asyncio module

import asyncio

@asyncio.coroutine
def task():
    print('start...')
    # asyncio itself speaks TCP, not HTTP; but HTTP is just a protocol
    # layered on top of TCP, so we can still send HTTP requests over a
    # raw TCP connection (see the next example)
    yield from asyncio.sleep(5)
    print('end')

tasks = [task(), task(), task()]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
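Note: the @asyncio.coroutine / yield from style above was deprecated in Python 3.8 and removed in Python 3.11. A minimal equivalent in modern async/await syntax (assuming Python 3.7+):

import asyncio

async def task():
    print('start...')
    await asyncio.sleep(5)  # suspend this task; the loop runs the others
    print('end')

async def main():
    # run all three tasks concurrently, just like gather() above
    await asyncio.gather(task(), task(), task())

asyncio.run(main())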

import asyncio

@asyncio.coroutine
def task(host, url='/'):
    # open a raw TCP connection to port 80 and write an HTTP/1.0 request by hand
    reader, writer = yield from asyncio.open_connection(host, 80)
    request_header_content = "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n" % (url, host)
    request_header_content = bytes(request_header_content, encoding='utf-8')
    writer.write(request_header_content)
    yield from writer.drain()
    text = yield from reader.read()
    print(host, url, text)
    writer.close()

# open_connection takes a bare hostname, not a full URL
tasks = [task('www.baidu.com'), task('www.cnblogs.com')]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()

import asyncio
import requests

@asyncio.coroutine
def task(fun, *args):
    print(fun, args)
    loop = asyncio.get_event_loop()
    # run the blocking requests call in the default thread pool so it
    # does not stall the event loop
    future = loop.run_in_executor(None, fun, *args)
    response = yield from future
    print(response.url, response.content)

tasks = [
    task(requests.get, 'http://bing.com'),
    task(requests.get, 'http://cnblogs.com'),
]
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
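A possible modern restatement of the same thread-pool trick (a sketch, assuming Python 3.7+): run_in_executor(None, ...) hands the blocking requests.get to the loop's default ThreadPoolExecutor, so the event loop stays free while the worker thread waits on the network.

import asyncio
import requests

async def task(url):
    loop = asyncio.get_running_loop()
    # requests.get runs in a worker thread; this coroutine suspends until
    # the thread finishes, and other tasks keep running meanwhile
    response = await loop.run_in_executor(None, requests.get, url)
    print(response.url, len(response.content))

async def main():
    await asyncio.gather(task('http://bing.com'), task('http://cnblogs.com'))

asyncio.run(main())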

# pip install aiohttp
# aiohttp + asyncio
import asyncio
import aiohttp

@asyncio.coroutine
def task(url):
    # module-level aiohttp.request(...) is the legacy 0.x-era API;
    # see the ClientSession form below for current releases
    response = yield from aiohttp.request("GET", url)
    print(response)

tasks = [task('http://bing.com'), task('http://cnblogs.com')]
loop = asyncio.get_event_loop()
result = loop.run_until_complete(asyncio.gather(*tasks))
loop.close()
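Current aiohttp releases (3.x) route every request through a ClientSession instead; a minimal sketch of the same two downloads in that style:

import asyncio
import aiohttp

async def task(url):
    # a session should normally be shared across requests; one per task
    # keeps this sketch short
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            print(response.status, len(await response.read()))

async def main():
    await asyncio.gather(task('http://bing.com'), task('http://cnblogs.com'))

asyncio.run(main())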
2. The tornado module

from tornado.httpclient import AsyncHTTPClient
from tornado.httpclient import HTTPRequest
from tornado import ioloop

COUNT = None  # total number of requests to send
count = 0     # number of responses received so far

def handle_response(response):
    global count, COUNT
    count += 1
    if response.error:
        print('error')
    else:
        body = response.body
        print(body)
    if count == COUNT:
        # all responses are in, so stop the otherwise endless IOLoop
        ioloop.IOLoop.instance().stop()

def fun():
    url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
    global COUNT
    COUNT = len(url_list)
    for url in url_list:
        http_client = AsyncHTTPClient()
        http_client.fetch(HTTPRequest(url), handle_response)

ioloop.IOLoop.current().add_callback(fun)
ioloop.IOLoop.current().start()  # loops forever until stop() is called
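Tornado 6 removed the callback argument to fetch(); on Tornado 5+ fetch() returns a Future, so the manual counter above can be replaced with a coroutine (a sketch, assuming Tornado 5+; tornado.gen.multi awaits a list of Futures much like asyncio.gather):

from tornado import ioloop
from tornado.gen import multi
from tornado.httpclient import AsyncHTTPClient

async def fetch_all():
    client = AsyncHTTPClient()
    urls = ['http://www.baidu.com', 'http://www.cnblogs.com']
    # raise_error=False returns error responses instead of raising
    responses = await multi([client.fetch(url, raise_error=False) for url in urls])
    for response in responses:
        print(response.code, len(response.body))

# run_sync starts the IOLoop, runs the coroutine to completion, then stops
ioloop.IOLoop.current().run_sync(fetch_all)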
3. The Twisted module

# async crawling with Twisted for higher concurrency
from twisted.web.client import getPage
from twisted.internet import reactor, defer

def one_done(args):
    # fires once per page, with the response body
    print(args)
    print(type(args))

def all_done(args):
    # fires after every Deferred in the DeferredList has resolved
    print(args)
    print(type(args))
    reactor.stop()

@defer.inlineCallbacks
def tasks(url):
    res = getPage(bytes(url, 'utf-8'))
    res.addCallback(one_done)
    yield res

url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
def_list = []
for i in url_list:
    v = tasks(i)
    def_list.append(v)

d = defer.DeferredList(def_list)
d.addBoth(all_done)
reactor.run()  # loops forever until reactor.stop() is called
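getPage has been deprecated since Twisted 16.7 in favor of twisted.web.client.Agent; the treq package (pip install treq) wraps Agent in a requests-like API. A sketch under that assumption:

import treq
from twisted.internet import defer, reactor

@defer.inlineCallbacks
def fetch(url):
    response = yield treq.get(url)       # Deferred firing with a response
    body = yield treq.content(response)  # Deferred firing with the body bytes
    print(url, len(body))

deferred_list = defer.DeferredList([
    fetch('http://www.baidu.com'),
    fetch('http://www.cnblogs.com'),
])
deferred_list.addBoth(lambda _: reactor.stop())
reactor.run()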
4. The gevent module

# pip install greenlet   # the underlying coroutine primitive
# pip install gevent     # async IO built on top of greenlet
from gevent import monkey
monkey.patch_all()  # patch blocking stdlib IO into cooperative async IO;
                    # must run before requests is imported

import gevent
import requests
from gevent.pool import Pool  # coroutine pool

pool = Pool(3)  # cap the number of concurrent requests

def task(method, url, req_kwargs):
    print(method, url, req_kwargs)
    response = requests.request(method, url, **req_kwargs)
    print(response.url)
    print(response.content)

# unbounded version:
# gevent.joinall([
#     gevent.spawn(task, method="GET", url="http://cnblogs.com", req_kwargs={}),
#     gevent.spawn(task, method="GET", url='http://bing.com', req_kwargs={}),
# ])

gevent.joinall([
    pool.spawn(task, method="GET", url="http://cnblogs.com", req_kwargs={}),
    pool.spawn(task, method="GET", url='http://bing.com', req_kwargs={}),
])

# gevent + requests bundled: grequests
# pip install grequests
import grequests

requests_list = [
    grequests.get('http://cnblogs.com'),
    grequests.get('http://bing.com'),
    grequests.get('http://che.com'),
]
response_list = grequests.map(requests_list)  # send all requests concurrently
print(response_list)
Summary:
When picking a module for custom async crawler IO, the recommended order is:
# gevent -> Twisted -> Tornado -> asyncio
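All four options wrap the same primitive: non-blocking sockets multiplexed by an event loop. A hand-rolled sketch of that "custom async crawler IO" (hypothetical Conn class, plain HTTP/1.0, no error handling):

import select
import socket

class Conn:
    """One in-flight HTTP request on a non-blocking socket."""
    def __init__(self, host):
        self.host = host
        self.data = b''
        self.sent = False
        self.sock = socket.socket()
        self.sock.setblocking(False)
        try:
            self.sock.connect((host, 80))
        except BlockingIOError:
            pass  # expected: a non-blocking connect returns immediately

    def fileno(self):
        # select() accepts any object that exposes fileno()
        return self.sock.fileno()

conns = [Conn('www.baidu.com'), Conn('www.cnblogs.com')]
while conns:
    readable, writable, _ = select.select(
        conns, [c for c in conns if not c.sent], [], 1)
    for c in writable:   # writable => TCP handshake finished: send once
        c.sock.sendall(b'GET / HTTP/1.0\r\nHost: ' + c.host.encode() + b'\r\n\r\n')
        c.sent = True
    for c in readable:   # readable => response bytes (or EOF) arrived
        chunk = c.sock.recv(8096)
        if chunk:
            c.data += chunk
        else:            # server closed the connection: response complete
            print(c.host, len(c.data))
            c.sock.close()
            conns.remove(c)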