zoukankan      html  css  js  c++  java
  • asyncio Queue的使用例子

    import aiohttp
    import asyncio
    import async_timeout
    from urllib.parse import urljoin, urldefrag
    
    
    root_url = "http://python.org/"
    crawled_urls, url_hub = [], [root_url, "%s/sitemap.xml" % (root_url), "%s/robots.txt" % (root_url)]
    headers = {'user-agent': 'Opera/9.80 (X11; Linux x86_64; U; en) Presto/2.2.15 Version/10.10'}
    
    
    async def get_body(url):
        async with aiohttp.ClientSession() as session:
            try:
                with async_timeout.timeout(10):
                    async with session.get(url, headers=headers) as response:
                        if response.status == 200:
                            html = await response.text()
                            return {'error': '', 'html': html}
                        else:
                            return {'error': response.status, 'html': ''}
            except Exception as err:
                return {'error': err, 'html': ''}
    
    async def handle_task(task_id, work_queue):
        while not work_queue.empty():
            queue_url = await work_queue.get()
            if not queue_url in crawled_urls:
                crawled_urls.append(queue_url)
                body = await get_body(queue_url)
                if not body['error']:
                    for new_url in get_urls(body['html']):
                        if root_url in new_url and not new_url in crawled_urls:
                            work_queue.put_nowait(new_url)
                else:
                    print(f"Error: {body['error']} - {queue_url}")
    
    def remove_fragment(url):
        pure_url, frag = urldefrag(url)
        return pure_url
    
    def get_urls(html):
        new_urls = [url.split('"')[0] for url in str(html).replace("'",'"').split('href="')[1:]]
        return [urljoin(root_url, remove_fragment(new_url)) for new_url in new_urls]
    
    if __name__ == "__main__":
        q = asyncio.Queue()
        [q.put_nowait(url) for url in url_hub]    
        loop = asyncio.get_event_loop()
        tasks = [handle_task(task_id, q) for task_id in range(3)]
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
        for u in crawled_urls:
            print(u)
        print('-'*30)
        print(len(crawled_urls))
    

      

  • 相关阅读:
    printf语句中%p ,%#x区别
    Ant执行一个含有main方法的class文件
    aix 扩展文件系统
    ORA-01653:表空间扩展失败的问题(开启表空间自动扩展)
    oracle创建表空间语句分解
    Oracle10g/11g 在SUSE/RHEL上的安装与配置
    15个实用的Linux find命令示例
    suse安装软件命令
    如何把.rar文件隐藏在一个图片内
    windows 7 中将“我的电脑”锁定到任务栏
  • 原文地址:https://www.cnblogs.com/c-x-a/p/10571835.html
Copyright © 2011-2022 走看看