  • aiohttp template

    A minimal asynchronous crawler template: a few worker tasks pull URLs from a shared asyncio.Queue, fetch each page with aiohttp under a timeout, and record the URLs that were crawled successfully.

    import aiohttp
    import asyncio
    import async_timeout
    from urllib.parse import urljoin, urldefrag

    root_url = 'http://python.org/'  # starting URL
    crawled_urls, url_hub = [], [root_url]
    headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}

    async def get_body(url):
        async with aiohttp.ClientSession() as session:
            try:
                async with async_timeout.timeout(10):  # request timeout
                    async with session.get(url, headers=headers) as response:
                        if response.status == 200:
                            html = await response.text()
                            return {'error': '', 'html': html, 'url': url}
                        else:
                            return {'error': response.status, 'html': '', 'url': url}
            except Exception as err:
                # `response` may not exist yet here, so report the exception itself
                return {'error': str(err), 'html': '', 'url': url}

    async def handle_task(task_id, work_queue):
        while not work_queue.empty():
            queue_url = await work_queue.get()
            if queue_url not in crawled_urls:
                body = await get_body(queue_url)
                if not body['error']:
                    crawled_urls.append(queue_url)
                    parse(body)
                else:
                    print('failed to crawl {}'.format(queue_url))

    # parse the fetched page; extend this (e.g. call get_urls(body['html'])
    # and put the new links on the queue) to crawl beyond the seed URLs
    def parse(body):
        pass

    def remove_fragment(url):
        pure_url, frag = urldefrag(url)
        return pure_url

    # pull href values out of the HTML and join them against the root URL
    def get_urls(html):
        new_urls = [url.split('"')[0] for url in str(html).replace("'", '"').split('href="')[1:]]
        return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]

    if __name__ == '__main__':
        q = asyncio.Queue()  # the shared work queue
        for url in url_hub:  # seed it with the starting URLs
            q.put_nowait(url)
        loop = asyncio.get_event_loop()
        tasks = [handle_task(task_id, q) for task_id in range(3)]  # 3 concurrent workers
        # asyncio.gather accepts coroutines; asyncio.wait no longer does on Python 3.11+
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
        for u in crawled_urls:
            print(u)
        print('-' * 30)
        print(len(crawled_urls))
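    get_urls extracts links by string-splitting on href=", which is fragile: it misses single-quoted or unquoted attributes and trips over "href=" appearing in plain text. If a dependency-free but sturdier extractor is wanted, here is a sketch using only the standard library's html.parser; the LinkExtractor class name is my own, the rest is stdlib API:

    from html.parser import HTMLParser
    from urllib.parse import urljoin, urldefrag

    class LinkExtractor(HTMLParser):
        """Collects href attribute values from anchor tags."""
        def __init__(self):
            super().__init__()
            self.hrefs = []

        def handle_starttag(self, tag, attrs):
            # attrs is a list of (name, value) pairs; value is None for bare attributes
            if tag == 'a':
                for name, value in attrs:
                    if name == 'href' and value:
                        self.hrefs.append(value)

    def get_urls(html):
        parser = LinkExtractor()
        parser.feed(str(html))
        # resolve relative links against the root and drop #fragments,
        # mirroring what the split-based version above does
        return [urljoin(root_url, urldefrag(href)[0]) for href in parser.hrefs]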
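    Newer runtimes change two things this template leans on: asyncio.wait stopped accepting bare coroutines in Python 3.11, and aiohttp ships its own ClientTimeout, which makes the async_timeout dependency unnecessary. Below is a sketch of the same crawler in that style, reusing headers, url_hub, crawled_urls, and parse from the template above; sharing one session across workers is a change this variant makes, not something the original does:

    import asyncio
    import aiohttp

    async def get_body(session, url):
        # the shared session already carries the 10-second total timeout
        try:
            async with session.get(url, headers=headers) as response:
                if response.status == 200:
                    return {'error': '', 'html': await response.text(), 'url': url}
                return {'error': response.status, 'html': '', 'url': url}
        except Exception as err:
            return {'error': str(err), 'html': '', 'url': url}

    async def handle_task(task_id, work_queue, session):
        while not work_queue.empty():
            queue_url = await work_queue.get()
            if queue_url not in crawled_urls:
                body = await get_body(session, queue_url)
                if not body['error']:
                    crawled_urls.append(queue_url)
                    parse(body)

    async def main():
        timeout = aiohttp.ClientTimeout(total=10)  # replaces async_timeout
        # one session shared by all workers, so connections are reused
        async with aiohttp.ClientSession(timeout=timeout) as session:
            q = asyncio.Queue()
            for url in url_hub:
                q.put_nowait(url)
            await asyncio.gather(*(handle_task(i, q, session) for i in range(3)))

    if __name__ == '__main__':
        asyncio.run(main())  # creates and closes the event loop itself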
  • Original post: https://www.cnblogs.com/zhongshuiping/p/10172362.html