  • Using a queue with aiohttp

    Fetch Baidu search results, then resolve Baidu's long redirect links into the real target URLs.

    import time
    import aiofiles
    import aiohttp
    import asyncio
    from lxml import etree
    from asyncio import Queue
    from itertools import product
    import async_timeout
    
    MAX_THREADS = 50  # number of concurrent worker coroutines (not OS threads)
    
    
    class BaiduSpider:
        def __init__(self):
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36"
                              "(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}
            self.q = Queue()    # search-result page URLs to fetch
            self.q2 = Queue()   # [title, link] pairs whose redirects need resolving
            self.seen = set()   # resolved URLs already written to links.txt
    
        def url_generator(self):
            with open('keyword.txt', 'r', encoding='utf8') as f:
                # Pair every keyword with pages 0-4; Baidu's pn parameter is a
                # result offset that advances in steps of 10 per page.
                for keyword, page in product(f, range(5)):
                    yield f"https://www.baidu.com/s?wd={keyword.strip()}&pn={page * 10}"
    
        async def fetch(self, session, url):
            try:
                async with async_timeout.timeout(1):
                    async with session.get(url, headers=self.headers) as resp:
                        if resp.status in [200, 201]:
                            return await resp.text()
            except Exception:
                pass  # swallow timeouts and network errors; the caller treats None as "no page"
    
        async def work(self, session):
            # Consumer: pull search-result URLs off the queue, fetch and parse each.
            while not self.q.empty():
                url = await self.q.get()
                html = await self.fetch(session, url)
                await self.parser(session, html)
                self.q.task_done()
    
        async def parser(self, session, html):
            if html:
                tree = etree.HTML(html)
                # Each organic result is an <a> inside <h3 class="t">.
                for node in tree.xpath('//h3[@class="t"]/a'):
                    title = node.xpath('string(.)')
                    link = node.xpath('@href')[0]
                    self.q2.put_nowait([title or '', link])
                await self.work2(session)
    
        async def work2(self, session):
            # Second-stage consumer: request each Baidu redirect link; aiohttp
            # follows redirects, so resp2.url is the real target URL.
            while not self.q2.empty():
                title, link = await self.q2.get()
                try:
                    async with async_timeout.timeout(1):
                        async with session.get(link, headers=self.headers) as resp2:
                            print(resp2.url, title)
                            if str(resp2.url) not in self.seen:
                                self.seen.add(str(resp2.url))
                                async with aiofiles.open('links.txt', 'a', encoding='utf-8') as fd:
                                    await fd.write(f"{title},{resp2.url}\n")
                except Exception:
                    pass  # skip links that time out or otherwise fail
    
        async def download(self):
            conn = aiohttp.TCPConnector(ssl=False)  # skip certificate verification to avoid SSL errors
            for url in self.url_generator():
                self.q.put_nowait(url)
            async with aiohttp.ClientSession(connector=conn) as session:
                tasks = [asyncio.ensure_future(self.work(session)) for _ in range(MAX_THREADS)]
                await asyncio.wait(tasks)
    
        def run(self):
            start_time = time.time()
            loop = asyncio.get_event_loop()
            loop.run_until_complete(self.download())
            print(f'Total elapsed time: {time.time() - start_time} seconds')
    
    
    if __name__ == '__main__':
        baidu = BaiduSpider()
        baidu.run()
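
    Resolving the redirect is the heart of the second queue, so here is a minimal
    standalone sketch of just that step. It assumes any reachable redirect URL
    (the /link?url=... value below is a placeholder); aiohttp follows redirects
    by default, so the final address is available as resp.url and the
    intermediate hops as resp.history.

    import asyncio
    import aiohttp

    async def resolve(redirect_url):
        """Follow redirects and return the final URL as a string."""
        async with aiohttp.ClientSession() as session:
            # allow_redirects=True is aiohttp's default; resp.url is the last hop.
            async with session.get(redirect_url, allow_redirects=True) as resp:
                for hop in resp.history:  # the intermediate 3xx responses
                    print('redirected via:', hop.url)
                return str(resp.url)

    if __name__ == '__main__':
        # Placeholder redirect link; substitute a real Baidu /link?url=... value.
        loop = asyncio.get_event_loop()
        print(loop.run_until_complete(resolve('https://www.baidu.com/link?url=...')))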
    
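    A design note on the queue handling: the workers above exit as soon as
    q.empty() returns True, which can race when producers are still filling the
    queue. The standard asyncio pattern is to await queue.join() and then cancel
    the workers once every item has been marked task_done(). A minimal sketch of
    that pattern, with placeholder example URLs and work:

    import asyncio

    async def worker(q):
        while True:  # run until cancelled; q.join() below is the shutdown signal
            url = await q.get()
            try:
                print('handling', url)  # placeholder for real fetch/parse work
            finally:
                q.task_done()  # tells q.join() this item is finished

    async def main():
        q = asyncio.Queue()
        for n in range(20):
            q.put_nowait(f'https://example.com/page/{n}')
        workers = [asyncio.ensure_future(worker(q)) for _ in range(5)]
        await q.join()  # blocks until every queued item is task_done()
        for w in workers:
            w.cancel()  # stop the now-idle workers
        await asyncio.gather(*workers, return_exceptions=True)

    asyncio.get_event_loop().run_until_complete(main())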
    
  • Original article: https://www.cnblogs.com/c-x-a/p/10668977.html