# 协程
import asyncio
import time
#定义了一个特殊的函数
#特殊:调用后会返回一个协程对象,且函数内部的实现语句不会被立即执行
#创建一个协程对象
# async def test(num):
# print(num)
#
# c = test(10)
# print(c)
#封装一个任务对象
# async def test(num):
# print(num)
#
# c = test(10)
# #根据协程对象封装了一个任务对象
# task = asyncio.ensure_future(c)
# print(task)
#事件循环对象
async def request(url, delay=2):
    """Simulate handling a request for *url*.

    Calling this function does not run its body; it only creates a coroutine
    object that an event loop must drive.

    :param url: label printed before and after the simulated work.
    :param delay: seconds of simulated work (default 2, the original
        hard-coded value).
    """
    print('正在请求:',url)
    # time.sleep() would block the whole event loop; asyncio.sleep() yields
    # control back to the loop while waiting.
    await asyncio.sleep(delay)
    print('请求完毕!',url)
# Create a coroutine object (the function body has not executed yet).
c1 = request('www.1.com')
# Wrap the coroutine object into a task object.
task_A = asyncio.ensure_future(c1)
# Create an event loop object.
loop = asyncio.get_event_loop()
# Register the task object with the loop and start the event loop.
loop.run_until_complete(task_A)
# 任务对象绑定回调
import asyncio
import time
async def request(url, delay=2):
    """Simulate a request and return *url* so a done-callback can read it
    via ``task.result()``.

    :param url: echoed in prints and returned as the task result.
    :param delay: seconds of simulated work (default 2, the original
        hard-coded value).
    """
    print('正在请求:',url)
    # await asyncio.sleep keeps the event loop responsive; the original
    # time.sleep(2) would block it.
    await asyncio.sleep(delay)
    print('请求完毕!',url)
    return url
#定义一个任务对象的回调函数
#task参数表示的就是该函数被绑定的那个任务对象
def task_callback(task):
    """Done-callback bound to a task object.

    :param task: the finished task this callback was attached to;
        ``task.result()`` is the return value of the task's coroutine.
    """
    print('i am task_callback()')
    print(task.result())
#task.result()返回的就是任务对象对应的特殊函数内部的返回值
# Create the coroutine object; its body has not run yet.
c = request('www.xxx.com')
# Wrap the coroutine into a task object.
task = asyncio.ensure_future(c)
# task_callback fires automatically once the task finishes.
task.add_done_callback(task_callback)
loop = asyncio.get_event_loop()
loop.run_until_complete(task)
# 多任务异步协程
import asyncio
import time
start = time.time()  # wall-clock start; elapsed time is printed at the end
#在特殊函数内部不可以出现不支持异步模块相关的代码
async def request(url, delay=2):
    """Async-friendly request simulation for the multi-task demo.

    Only async-aware code may appear inside a coroutine; a blocking call such
    as time.sleep() would serialize the tasks instead of overlapping them.

    :param url: echoed in prints and returned as the task result.
    :param delay: seconds of simulated work (default 2, the original
        hard-coded value).
    """
    print('正在请求:',url)
    # Blocking operations must be awaited so the loop can run other tasks.
    await asyncio.sleep(delay)
    print('请求完毕!',url)
    return url
# Placeholder "URLs" — one task object is created per entry below.
urls = [
'www.1.com',
'www.2.com',
'www.3.com'
]
def task_callback(task):
    """Done-callback: print the return value of the finished task."""
    print(task.result())
tasks = []  # multi-task list: holds one task object per URL
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(task_callback)
    tasks.append(task)
loop = asyncio.get_event_loop()
# asyncio.wait suspends until every task in the list has completed; the
# per-task sleeps overlap, so total time is roughly one sleep, not the sum.
loop.run_until_complete(asyncio.wait(tasks))
print(time.time()-start)
# 多任务异步爬虫
import asyncio
import time
import requests
start = time.time()  # wall-clock start; elapsed time is printed at the end
#在特殊函数内部不可以出现不支持异步模块相关的代码
async def request(url):
    """Fetch *url* with requests and return the response body as text.

    NOTE(review): requests.get() is synchronous and blocks the event loop,
    so tasks built on this coroutine run one after another rather than
    concurrently; a non-blocking client (e.g. aiohttp, used later in this
    file) is needed for real async fetching.
    """
    print('正在请求:',url)
    response = requests.get(url)
    return response.text
# Endpoints of a local test server (presumably a dev app on port 5000 —
# confirm the server is running before executing this section).
urls = [
'http://127.0.0.1:5000/bobo',
'http://127.0.0.1:5000/tom',
'http://127.0.0.1:5000/jay'
]
def parse(task):
    """Done-callback: print the page text fetched by the finished task."""
    page_text = task.result()
    print(page_text+',请求到的数据!!!')
tasks = []  # one task object per endpoint
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)
loop = asyncio.get_event_loop()
# Register the whole task list and run the loop until all fetches finish.
loop.run_until_complete(asyncio.wait(tasks))
print(time.time()-start)
# aiohttp使用
# import asyncio
# import time
# import aiohttp
# start = time.time()
# 在特殊函数内部不可以出现不支持异步模块相关的代码
# 简单的基本架构:
async def request(url):
    """Bare skeleton of an aiohttp fetch.

    NOTE(review): as written this will not run — aiohttp's ClientSession and
    its responses must be used with ``async with`` and each blocking step
    awaited; the commented-out refined version below adds those keywords.
    """
    with aiohttp.ClientSession() as s:
        # s.get/post mirror requests' get/post: url, headers, data/params;
        # a proxy is passed as proxy="http://ip:port".
        with s.get(url) as response:
            # response.text() -> str body; response.read() -> bytes body.
            page_text = response.text()
            return page_text
# 在当前架构的基础上补充细节即可
# 细节1:在每一个with前加上async关键字
# 细节2:在get方法前和response.text()前加上await关键字进行手动挂起操作
# async def request(url):
# async with aiohttp.ClientSession() as s:
# s.get/post和requests中的get/post用法几乎一样:url,headers,data/params
# 在s.get中如果使用代理操作:proxy="http://ip:port"
# async with await s.get(url) as response:
# 获取字符串形式的响应数据:response.text()
# 获取byte类型的:response.read()
# page_text = await response.text()
# return page_text
# Nine requests (three endpoints, each repeated three times) against the
# local test server, to make the concurrency gain visible.
urls = [
'http://127.0.0.1:5000/bobo',
'http://127.0.0.1:5000/tom',
'http://127.0.0.1:5000/jay',
'http://127.0.0.1:5000/bobo',
'http://127.0.0.1:5000/tom',
'http://127.0.0.1:5000/jay',
'http://127.0.0.1:5000/bobo',
'http://127.0.0.1:5000/tom',
'http://127.0.0.1:5000/jay',
]
# urls = []
# for i in range(500):
# urls.append('http://127.0.0.1:5000/bobo')
# def parse(task):
# page_text = task.result()
# print(page_text+',请求到的数据!!!')
# tasks = []
# for url in urls:
# c = request(url)
# task = asyncio.ensure_future(c)
# task.add_done_callback(parse)
# tasks.append(task)
# loop = asyncio.get_event_loop()
# loop.run_until_complete(asyncio.wait(tasks))
# print(time.time()-start)
# 案例
import aiohttp
import asyncio
from lxml import etree
# Collects every post title scraped across all pages.
all_titles = []
# Desktop Chrome User-Agent so the site serves the normal page.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
async def request(url):
    """Asynchronously fetch *url* with aiohttp and return the body as text."""
    async with aiohttp.ClientSession() as s:
        # s.get mirrors requests.get (url, headers, ...); each blocking step
        # is awaited so the event loop can run other tasks meanwhile.
        async with await s.get(url,headers=headers) as response:
            page_text = await response.text()
            return page_text
urls = []
# %d is the record offset: each listing page holds 30 records.
url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page=%d'
for page in range(100):
    u_page = page * 30
    # url % u_page already yields a str; the original wrapped it in a
    # redundant format() call.
    new_url = url % u_page
    urls.append(new_url)
tasks = []  # task objects awaited together by the event loop below
def parse(task):
    """Done-callback: extract every post title from the fetched page and
    collect it into the module-level all_titles list."""
    page_text = task.result()
    # NOTE(review): gb2312-encode then gbk-decode is a mojibake repair; it
    # only works if the text survived the round trip — confirm the server's
    # actual charset.
    page_text = page_text.encode('gb2312').decode('gbk')
    tree = etree.HTML(page_text)
    tr_list = tree.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
    for tr in tr_list:
        title_nodes = tr.xpath('./td[2]/a[2]/text()')
        # Guard: skip rows without the expected title anchor instead of
        # raising IndexError on an empty xpath result.
        if not title_nodes:
            continue
        title = title_nodes[0]
        print(title)
        all_titles.append(title)
# Build one task per page URL, attach the parse callback, then run the loop
# until every fetch (and its callback) has completed.
for url in urls:
    c = request(url)
    task = asyncio.ensure_future(c)
    task.add_done_callback(parse)
    tasks.append(task)
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))