zoukankan      html  css  js  c++  java
  • 爬虫高并发之异步IO

    1.asyncio模块

    import asyncio

    async def task(delay=5):
        """Toy coroutine: sleep *delay* seconds without blocking the event loop.

        asyncio.sleep only supports TCP-level waiting, not HTTP requests —
        but HTTP is layered on TCP, so an HTTP request can be sent over a
        raw TCP connection (see the next example).

        *delay* defaults to 5 to match the original demo; pass a smaller
        value for quick runs.
        """
        print('start...')
        await asyncio.sleep(delay)  # yields control; other tasks run meanwhile
        print('end')

    if __name__ == '__main__':
        # Three coroutines sleep concurrently: total wall time ~5s, not 15s.
        tasks = [task(), task(), task()]
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    基本用法
    import asyncio

    async def task(host, url='/'):
        """Send a minimal HTTP/1.0 GET for *url* to *host* over raw TCP.

        *host* must be a bare hostname (e.g. 'www.baidu.com'), not a full
        URL: asyncio.open_connection resolves it via DNS and connects to
        port 80.
        """
        reader, writer = await asyncio.open_connection(host, 80)

        # HTTP header lines are CRLF-terminated; a blank line ends the headers.
        request_header_content = 'GET %s HTTP/1.0\r\nHost: %s\r\n\r\n' % (url, host)
        writer.write(request_header_content.encode('utf-8'))
        await writer.drain()        # wait until the write buffer is flushed
        text = await reader.read()  # HTTP/1.0: server closes when done, read to EOF
        writer.close()
        print(host, url, text)

    if __name__ == '__main__':
        # Bare hostnames, not 'http://...' URLs — the scheme is not resolvable.
        tasks = [task('www.baidu.com'), task('www.cnblogs.com')]

        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    基于TCP发送HTTP请求
    import asyncio

    async def task(fun, *args):
        """Run the blocking callable *fun(*args)* in the default executor.

        Bridges a blocking HTTP library (e.g. requests) into asyncio:
        run_in_executor hands the call to a worker thread and returns an
        awaitable future, so the event loop stays free while it runs.
        Prints the response's url and content.
        """
        print(fun, args)
        loop = asyncio.get_running_loop()
        future = loop.run_in_executor(None, fun, *args)  # None -> default thread pool
        response = await future
        print(response.url, response.content)


    if __name__ == '__main__':
        import requests  # blocking client, off-loaded to threads by task()

        tasks = [
            task(requests.get, 'http://bing.com'),
            task(requests.get, 'http://cnblogs.com'),
        ]
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    asyncio+requests
    # pip install aiohttp
    # aiohttp + asyncio
    import asyncio
    import aiohttp

    async def task(url):
        """Fetch *url* with aiohttp (an HTTP client built on asyncio) and print it.

        aiohttp 3.x requires the request to be used as an async context
        manager so the underlying connection is released deterministically.
        """
        async with aiohttp.request("GET", url) as response:
            print(response)

    if __name__ == '__main__':
        tasks = [task('http://bing.com'), task('http://cnblogs.com')]

        loop = asyncio.get_event_loop()
        result = loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
    asyncio+aiohttp

    2.tornado模块

    from tornado.httpclient import AsyncHTTPClient
    from tornado.httpclient import HTTPRequest
    from tornado import ioloop

    COUNT = None  # total number of requests issued (set by fun)
    count = 0     # responses handled so far


    def handle_response(response):
        """Per-response callback: print the result, stop the loop when all are done."""
        global count, COUNT
        count += 1
        if response.error:
            print('error')
        else:
            print(response.body)
        # Once every outstanding request has answered, break the IOLoop.
        if count == COUNT:
            ioloop.IOLoop.instance().stop()


    def fun():
        """Kick off one asynchronous fetch per URL; completion is tracked by the callback."""
        global COUNT
        url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
        COUNT = len(url_list)
        for url in url_list:
            AsyncHTTPClient().fetch(HTTPRequest(url), handle_response)


    ioloop.IOLoop.current().add_callback(fun)
    ioloop.IOLoop.current().start()  # runs forever until stop() is called
    tornado异步IO

    3.Twisted模块

    # Asynchronous crawling with Twisted for higher concurrency.
    from twisted.web.client import getPage
    from twisted.internet import reactor, defer


    def one_done(args):
        """Fires for each page as soon as its body arrives."""
        print(args)
        print(type(args))


    def all_done(args):
        """Fires once every deferred has resolved; shuts the reactor down."""
        print(args)
        print(type(args))
        reactor.stop()


    @defer.inlineCallbacks
    def tasks(url):
        """Request *url* and attach the per-page callback."""
        deferred = getPage(bytes(url, 'utf-8'))
        deferred.addCallback(one_done)
        yield deferred

    url_list = ['http://www.baidu.com', 'http://www.cnblogs.com']
    def_list = [tasks(address) for address in url_list]

    d = defer.DeferredList(def_list)
    d.addBoth(all_done)  # called whether the list succeeds or fails
    reactor.run()  # runs forever until reactor.stop()
    twisted异步IO

    4.gevent模块

    # pip install greenlet   (coroutine primitive)
    # pip install gevent     (builds on greenlet, adds async IO)

    import gevent
    import requests
    from gevent.pool import Pool  # greenlet pool
    from gevent import monkey

    monkey.patch_all()  # patch blocking stdlib calls so requests runs async

    pool = Pool(3)  # cap concurrent greenlets at 3

    def task(method, url, req_kwargs):
        """Issue a single HTTP request and print the response."""
        print(method, url, req_kwargs)
        resp = requests.request(method, url, **req_kwargs)
        print(resp.url)
        print(resp.content)


    # pool.spawn (rather than gevent.spawn) enforces the Pool(3) limit.
    gevent.joinall([
        pool.spawn(task, method="GET", url="http://cnblogs.com", req_kwargs={}),
        pool.spawn(task, method="GET", url='http://bing.com', req_kwargs={}),
    ])
    gevent+requests
    # gevent + requests, packaged together: grequests
    import grequests

    # Build the (unsent) request objects first...
    requests_list = [
        grequests.get(target)
        for target in ('http://cnblogs.com', 'http://bing.com', 'http://che.com')
    ]

    # ...then send them all concurrently and collect the responses in order.
    response_list = grequests.map(requests_list)
    print(response_list)
    grequests

    总结:

    自定义异步爬虫IO时:

      #gevent->Twisted->Tornado->asyncio

     

  • 相关阅读:
    计算器代码
    acm数论之旅(转载)---最大公约数与最小公倍数
    acm数论之旅(转载) -- 快速幂
    acm数论之旅(转载)--素数
    位运算符(转载)
    最短路问题
    并查集
    深度优先探索与广度优先探索
    ACM注意事项
    LTE Module User Documentation(翻译6)——物理误差模型、MIMO模型、天线模型
  • 原文地址:https://www.cnblogs.com/lujiacheng-Python/p/10255902.html
Copyright © 2011-2022 走看看