zoukankan      html  css  js  c++  java
  • asyncio Queue的使用例子

    import asyncio
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urldefrag

    import aiohttp
    import async_timeout
    
    
    # Site root of the crawl; only URLs under this prefix are followed.
    root_url = "http://python.org/"
    # URLs that have already been taken off the work queue (in visit order).
    crawled_urls = []
    # Seed URLs. Built with urljoin because root_url already ends in "/";
    # plain "%s/..." formatting would yield "http://python.org//sitemap.xml".
    url_hub = [root_url, urljoin(root_url, "sitemap.xml"), urljoin(root_url, "robots.txt")]
    # Request headers sent with every fetch.
    headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}
    
    
    async def get_body(url):
        """Fetch ``url`` and report the outcome as a dict.

        A new ``aiohttp.ClientSession`` is opened for each call and the
        request is bounded by a 10 second timeout.

        Args:
            url: The URL to request (sent with the module-level ``headers``).

        Returns:
            ``{'error': '', 'html': <response text>}`` when the status is 200;
            ``{'error': <status code>, 'html': ''}`` for any other status;
            ``{'error': <exception>, 'html': ''}`` if the request raised
            (including the timeout expiring).
        """
        async with aiohttp.ClientSession() as session:
            try:
                # ``async with`` is required here: async_timeout 4.x dropped
                # support for the plain (synchronous) ``with`` form.
                async with async_timeout.timeout(10):
                    async with session.get(url, headers=headers) as response:
                        if response.status != 200:
                            return {'error': response.status, 'html': ''}
                        html = await response.text()
                        return {'error': '', 'html': html}
            except Exception as err:
                # Best-effort crawl: report the failure to the caller
                # instead of aborting the worker.
                return {'error': err, 'html': ''}
    
    async def handle_task(task_id, work_queue):
        """Worker coroutine: pull URLs off ``work_queue`` and crawl them.

        Each URL not yet in ``crawled_urls`` is recorded there, fetched with
        ``get_body``, and every link found in the page that lies under
        ``root_url`` and has not been crawled yet is put back on the queue.
        Fetch failures are printed and otherwise ignored.

        Args:
            task_id: Identifier of this worker (currently unused).
            work_queue: ``asyncio.Queue`` of URL strings shared by all workers.

        NOTE: the loop ends as soon as this worker observes an empty queue,
        even if other workers are still fetching pages that may add more URLs.
        """
        while not work_queue.empty():
            queue_url = await work_queue.get()
            if queue_url in crawled_urls:
                continue
            # Mark as crawled before awaiting so other workers skip it.
            crawled_urls.append(queue_url)
            body = await get_body(queue_url)
            if body['error']:
                print(f"Error: {body['error']} - {queue_url}")
                continue
            for new_url in get_urls(body['html']):
                # Prefix match, not substring match: a foreign URL that merely
                # embeds root_url (e.g. in its query string) must not be crawled.
                if new_url.startswith(root_url) and new_url not in crawled_urls:
                    work_queue.put_nowait(new_url)
    
    def remove_fragment(url):
        """Return ``url`` with any ``#fragment`` part stripped off."""
        # urldefrag() yields a (url, fragment) pair; only the first item is needed.
        return urldefrag(url)[0]
    
    class _HrefCollector(HTMLParser):
        """HTML parser that records the ``href`` value of every start tag."""

        def __init__(self):
            super().__init__()
            self.hrefs = []

        def handle_starttag(self, tag, attrs):
            # Attribute names arrive lower-cased; a valueless attribute
            # (``<a href>``) is reported with value None and is skipped.
            for name, value in attrs:
                if name == 'href' and value is not None:
                    self.hrefs.append(value)

    def get_urls(html):
        """Extract all ``href`` targets from ``html`` as absolute URLs.

        The markup is parsed with the standard-library HTML parser rather
        than by splitting on ``href="``, so single-quoted, unquoted and
        upper-case ``HREF`` attributes are found, apostrophes inside a URL
        are preserved, and character references such as ``&amp;`` are decoded.

        Args:
            html: Page markup; converted with ``str()`` before parsing.

        Returns:
            A list (document order, duplicates kept) of the href values with
            their fragment removed and resolved against ``root_url``.
        """
        collector = _HrefCollector()
        collector.feed(str(html))
        collector.close()  # flush any buffered trailing markup
        return [urljoin(root_url, remove_fragment(href)) for href in collector.hrefs]
    
    if __name__ == "__main__":
        # Create and install the loop explicitly: relying on
        # asyncio.get_event_loop() to create one implicitly is deprecated.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            # The queue is created after the loop is installed so that it is
            # associated with this loop on older Python versions as well.
            q = asyncio.Queue()
            for url in url_hub:
                q.put_nowait(url)
            # asyncio.wait() rejects bare coroutines on Python 3.11+, so the
            # three workers are wrapped in Tasks first.
            tasks = [loop.create_task(handle_task(task_id, q)) for task_id in range(3)]
            loop.run_until_complete(asyncio.wait(tasks))
        finally:
            loop.close()
        # Report every crawled URL followed by the total count.
        for u in crawled_urls:
            print(u)
        print('-'*30)
        print(len(crawled_urls))
    

      

  • 相关阅读:
    自动批改android模拟器的imei的小程序 和 下载各个版本SDK Tools及ADT
    BlueStacks模拟器BlueStacks.prop文件里中英文对照表
    HTTP Analyzer——WEB调试代理
    XCODE 添加不同IOS版本的模拟器
    在PC上运行安卓(Android)应用程序的几个方法
    二叉查找树的类模板实现
    以给定值为基分割链表
    简单二叉排序树的实现
    vector和list删除元素
    二叉树的基本操作
  • 原文地址:https://www.cnblogs.com/c-x-a/p/10571835.html
Copyright © 2011-2022 走看看