  • aiohttp template

    A minimal asynchronous crawler template: a few worker tasks pull URLs from a shared asyncio.Queue, fetch each page with aiohttp under a timeout, and record the URLs that were crawled successfully.

    import aiohttp
    import asyncio
    import async_timeout
    from urllib.parse import urljoin, urldefrag

    root_url = 'http://python.org/'  # starting URL
    crawled_urls, url_hub = [], [root_url]
    headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}

    async def get_body(url):
        async with aiohttp.ClientSession() as session:
            try:
                async with async_timeout.timeout(10):  # request timeout
                    async with session.get(url, headers=headers) as response:
                        if response.status == 200:
                            html = await response.text()
                            return {'error': '', 'html': html, 'url': url}
                        else:
                            return {'error': response.status, 'html': '', 'url': url}
            except Exception as err:
                # `response` may not exist yet here, so report the exception itself
                return {'error': str(err), 'html': '', 'url': url}

    async def handle_task(task_id, work_queue):
        while not work_queue.empty():
            queue_url = await work_queue.get()
            if queue_url not in crawled_urls:
                body = await get_body(queue_url)
                if not body['error']:
                    crawled_urls.append(queue_url)
                    parse(body)
                else:
                    print('failed to crawl {}'.format(queue_url))

    # parse the fetched page; extend this (e.g. call get_urls(body['html'])
    # and put the new links on the queue) to crawl beyond the seed URLs
    def parse(body):
        pass

    def remove_fragment(url):
        pure_url, frag = urldefrag(url)
        return pure_url

    # pull href values out of the HTML and join them against the root URL
    def get_urls(html):
        new_urls = [url.split('"')[0] for url in str(html).replace("'", '"').split('href="')[1:]]
        return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]

    if __name__ == '__main__':
        q = asyncio.Queue()  # the shared work queue
        for url in url_hub:  # seed it with the starting URLs
            q.put_nowait(url)
        loop = asyncio.get_event_loop()
        tasks = [handle_task(task_id, q) for task_id in range(3)]  # 3 concurrent workers
        # asyncio.gather accepts coroutines; asyncio.wait no longer does on Python 3.11+
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
        for u in crawled_urls:
            print(u)
        print('-' * 30)
        print(len(crawled_urls))
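    get_urls extracts links by string-splitting on href=", which is fragile: it misses single-quoted or unquoted attributes and trips over "href=" appearing in plain text. If a dependency-free but sturdier extractor is wanted, here is a sketch using only the standard library's html.parser; the LinkExtractor class name is my own, the rest is stdlib API:

    from html.parser import HTMLParser
    from urllib.parse import urljoin, urldefrag

    class LinkExtractor(HTMLParser):
        """Collects href attribute values from anchor tags."""
        def __init__(self):
            super().__init__()
            self.hrefs = []

        def handle_starttag(self, tag, attrs):
            # attrs is a list of (name, value) pairs; value is None for bare attributes
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value:
                        self.hrefs.append(value)

    def get_urls(html):
        parser = LinkExtractor()
        parser.feed(str(html))
        # resolve relative links against the root and drop #fragments,
        # mirroring what the split-based version above does
        return [urljoin(root_url, urldefrag(href)[0]) for href in parser.hrefs]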
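    Newer runtimes change two things this template leans on: asyncio.wait stopped accepting bare coroutines in Python 3.11, and aiohttp ships its own ClientTimeout, which makes the async_timeout dependency unnecessary. Below is a sketch of the same crawler in that style, reusing headers, url_hub, crawled_urls, and parse from the template above; sharing one session across workers is a change this variant makes, not something the original does:

    import asyncio
    import aiohttp

    async def get_body(session, url):
        # the shared session already carries the 10-second total timeout
        try:
            async with session.get(url, headers=headers) as response:
                if response.status == 200:
                    return {'error': '', 'html': await response.text(), 'url': url}
                return {'error': response.status, 'html': '', 'url': url}
        except Exception as err:
            return {'error': str(err), 'html': '', 'url': url}

    async def handle_task(task_id, work_queue, session):
        while not work_queue.empty():
            queue_url = await work_queue.get()
            if queue_url not in crawled_urls:
                body = await get_body(session, queue_url)
                if not body['error']:
                    crawled_urls.append(queue_url)
                    parse(body)

    async def main():
        timeout = aiohttp.ClientTimeout(total=10)  # replaces async_timeout
        # one session shared by all workers, so connections are reused
        async with aiohttp.ClientSession(timeout=timeout) as session:
            q = asyncio.Queue()
            for url in url_hub:
                q.put_nowait(url)
            await asyncio.gather(*(handle_task(i, q, session) for i in range(3)))

    if __name__ == '__main__':
        asyncio.run(main())  # creates and closes the event loop itself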
  • Original post: https://www.cnblogs.com/zhongshuiping/p/10172362.html