  • Implementing a crawler with aiohttp

    import asyncio
    import re
    
    import aiohttp
    import aiomysql
    from pyquery import PyQuery
    
    
    stopping = False
    start_url = "http://www.jobbole.com/"
    waiting_urls = []
    seen_urls = set()
    
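    # Cap concurrency: at most 3 requests in flight at a time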
    sem = asyncio.Semaphore(3)
    
    
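    # Download one page; returns the body text, or None on error or non-2xx status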
    async def fetch(url, session):
        async with sem:
            await asyncio.sleep(1)
            try:
                async with session.get(url) as response:
                    print("Status:", response.status)
                    print("Content-type:", response.headers['content-type'])
                    if response.status in [200, 201]:
                        content = await response.text()
                        return content
            except Exception as e:
                print(e)
    
    
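    # Collect unseen /caijing links from the page and queue them for crawling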
    def extract_urls(html):
        pq = PyQuery(html)
        for link in pq.items("a"):
            url = link.attr("href")
            if url and url.startswith("/caijing") and url not in seen_urls:
                waiting_urls.append("http://www.jobbole.com{}".format(url))
    
    
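    # Crawl a listing page only to harvest more links from it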
    async def init_urls(url, session):
        html = await fetch(url, session)
        seen_urls.add(url)
        if html:
            extract_urls(html)
    
    
    async def article_handler(url, session, pool):
        # Fetch an article page, extract its title, and insert it into MySQL
        html = await fetch(url, session)
        seen_urls.add(url)
        if html is None:
            return
        extract_urls(html)
        pq = PyQuery(html)
        title = pq("title").text()
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                # A parameterized query keeps the page title from injecting SQL
                insert_sql = "insert into artest(title) values (%s)"
                await cur.execute(insert_sql, (title,))
    
    
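    # Worker loop: article pages get parsed and stored, other pages just re-seed the queue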
    async def consumer(pool):
        async with aiohttp.ClientSession() as session:
            while not stopping:
                if not waiting_urls:
                    await asyncio.sleep(0.5)
                    continue
                url = waiting_urls.pop()
                print("start get url: {}".format(url))
                if re.match(r"http://.*?jobbole\.com/.*\.html", url):
                    if url not in seen_urls:
                        asyncio.ensure_future(article_handler(url, session, pool))
                        await asyncio.sleep(0.5)
                else:
                    if url not in seen_urls:
                        asyncio.ensure_future(init_urls(url, session))
    
    
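    # Entry point: set up the MySQL pool, crawl the start page, then hand off to the consumer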
    async def main(loop):
        # Fill in your own MySQL credentials; 3306 is the default MySQL port
        pool = await aiomysql.create_pool(host='', port=3306,
                                          user='', password='',
                                          db='', loop=loop,
                                          charset="utf8", autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await fetch(start_url, session)
            seen_urls.add(start_url)
            if html:
                extract_urls(html)
        asyncio.ensure_future(consumer(pool))
    
    
    if __name__ == "__main__":
        loop = asyncio.get_event_loop()
        asyncio.ensure_future(main(loop))
        loop.run_forever()
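
The script assumes the target database already contains an artest table with a title column; the original post never shows the schema. Below is a minimal one-off setup sketch (not from the original post): the connection settings are placeholders mirroring the blanks in main(), and VARCHAR(255) is an assumed column size.

    import asyncio

    import aiomysql


    async def create_table():
        # Placeholder connection settings; substitute your own, as in main()
        conn = await aiomysql.connect(host="127.0.0.1", port=3306,
                                      user="root", password="",
                                      db="test", charset="utf8",
                                      autocommit=True)
        async with conn.cursor() as cur:
            # VARCHAR(255) is an assumed size; the post does not specify one
            await cur.execute(
                "create table if not exists artest ("
                "id int auto_increment primary key, "
                "title varchar(255) not null)"
            )
        conn.close()


    if __name__ == "__main__":
        asyncio.get_event_loop().run_until_complete(create_table())

With autocommit=True the DDL takes effect immediately; run it once before starting the crawler.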
  • Original article: https://www.cnblogs.com/yejing-snake/p/14276613.html