  • An example of using asyncio.Queue: a small concurrent crawler (a minimal Queue-only sketch follows the full listing below)

    import aiohttp
    import asyncio
    import async_timeout
    from urllib.parse import urljoin, urldefrag
    
    
    # Start URLs and a shared list of pages already crawled.
    root_url = "http://python.org/"
    crawled_urls = []
    url_hub = [root_url, urljoin(root_url, "sitemap.xml"), urljoin(root_url, "robots.txt")]
    headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}
    
    
    async def get_body(url):
        # Fetch one page and return its HTML, or the error encountered.
        async with aiohttp.ClientSession() as session:
            try:
                # async_timeout.timeout() is an async context manager, so it
                # needs "async with" (on Python 3.11+, asyncio.timeout also works).
                async with async_timeout.timeout(10):
                    async with session.get(url, headers=headers) as response:
                        if response.status == 200:
                            html = await response.text()
                            return {'error': '', 'html': html}
                        else:
                            return {'error': response.status, 'html': ''}
            except Exception as err:
                return {'error': err, 'html': ''}
    
    async def handle_task(task_id, work_queue):
        # Each worker pulls URLs from the shared queue until it is drained.
        while not work_queue.empty():
            queue_url = await work_queue.get()
            if queue_url not in crawled_urls:
                crawled_urls.append(queue_url)
                body = await get_body(queue_url)
                if not body['error']:
                    # Enqueue newly discovered links that stay on the root site.
                    for new_url in get_urls(body['html']):
                        if root_url in new_url and new_url not in crawled_urls:
                            work_queue.put_nowait(new_url)
                else:
                    print(f"Error: {body['error']} - {queue_url}")
    
    def remove_fragment(url):
        # Drop any "#fragment" so the same page is not crawled twice.
        pure_url, frag = urldefrag(url)
        return pure_url
    
    def get_urls(html):
        # Naive href extraction by string splitting; fine for this example.
        new_urls = [url.split('"')[0] for url in str(html).replace("'", '"').split('href="')[1:]]
        return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]
    
    async def main():
        q = asyncio.Queue()
        for url in url_hub:          # seed the queue with the start URLs
            q.put_nowait(url)
        # Run three crawler workers concurrently over the shared queue.
        await asyncio.gather(*(handle_task(task_id, q) for task_id in range(3)))
    
    if __name__ == "__main__":
        asyncio.run(main())
        for u in crawled_urls:
            print(u)
        print('-' * 30)
        print(len(crawled_urls))
    
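    The crawler above is the full example; as a companion, here is a minimal producer/consumer sketch that is not from the original post (the worker/main names and the range(10) work items are illustrative). It isolates the asyncio.Queue operations the crawler relies on, put_nowait() to seed work and get() to consume it, and adds task_done()/join() for a clean shutdown.

    import asyncio
    
    
    async def worker(name, queue):
        # Consume items until cancelled; task_done() lets queue.join() track progress.
        while True:
            item = await queue.get()
            print(f"{name} processed {item}")
            queue.task_done()
    
    
    async def main():
        queue = asyncio.Queue()
        for item in range(10):          # seed all work up front, like url_hub above
            queue.put_nowait(item)
    
        workers = [asyncio.create_task(worker(f"worker-{i}", queue)) for i in range(3)]
        await queue.join()              # wait until every queued item is marked done
        for w in workers:
            w.cancel()                  # workers loop forever, so stop them explicitly
    
    
    if __name__ == "__main__":
        asyncio.run(main())

    One design note: the crawler's while-not-empty() loop can let a worker exit at a moment when the queue happens to be empty even though another worker is still fetching a page and about to enqueue more links; the join()/task_done() pattern in the sketch avoids that race.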

      

  • Original post: https://www.cnblogs.com/c-x-a/p/10571835.html