  • Implementing a crawler with aiohttp

    import asyncio
    import re
    
    import aiohttp
    import aiomysql
    from pyquery import PyQuery
    
    
    stopping = False
    start_url = "http://www.jobbole.com/"
    waiting_urls = []
    seen_urls = set()
    
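    # Cap concurrency: at most 3 requests in flight at a time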
    sem = asyncio.Semaphore(3)
    
    
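    # Download one page; returns the body text, or None on error or non-2xx status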
    async def fetch(url, session):
        async with sem:
            await asyncio.sleep(1)
            try:
                async with session.get(url) as response:
                    print("Status:", response.status)
                    print("Content-type:", response.headers['content-type'])
                    if response.status in [200, 201]:
                        content = await response.text()
                        return content
            except Exception as e:
                print(e)
    
    
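    # Collect unseen /caijing links from the page and queue them for crawling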
    def extract_urls(html):
        pq = PyQuery(html)
        for link in pq.items("a"):
            url = link.attr("href")
            if url and url.startswith("/caijing") and url not in seen_urls:
                waiting_urls.append("http://www.jobbole.com{}".format(url))
    
    
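    # Crawl a listing page only to harvest more links from it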
    async def init_urls(url, session):
        html = await fetch(url, session)
        seen_urls.add(url)
        if html:
            extract_urls(html)
    
    
    async def article_handler(url, session, pool):
        # Fetch an article page, extract its title, and insert it into MySQL
        html = await fetch(url, session)
        seen_urls.add(url)
        if html is None:
            return
        extract_urls(html)
        pq = PyQuery(html)
        title = pq("title").text()
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                # A parameterized query keeps the page title from injecting SQL
                insert_sql = "insert into artest(title) values (%s)"
                await cur.execute(insert_sql, (title,))
    
    
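    # Worker loop: article pages get parsed and stored, other pages just re-seed the queue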
    async def consumer(pool):
        async with aiohttp.ClientSession() as session:
            while not stopping:
                if not waiting_urls:
                    await asyncio.sleep(0.5)
                    continue
                url = waiting_urls.pop()
                print("start get url: {}".format(url))
                if re.match(r"http://.*?jobbole\.com/.*\.html", url):
                    if url not in seen_urls:
                        asyncio.ensure_future(article_handler(url, session, pool))
                        await asyncio.sleep(0.5)
                else:
                    if url not in seen_urls:
                        asyncio.ensure_future(init_urls(url, session))
    
    
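    # Entry point: set up the MySQL pool, crawl the start page, then hand off to the consumer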
    async def main(loop):
        # Fill in your own MySQL credentials; 3306 is the default MySQL port
        pool = await aiomysql.create_pool(host='', port=3306,
                                          user='', password='',
                                          db='', loop=loop,
                                          charset="utf8", autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await fetch(start_url, session)
            seen_urls.add(start_url)
            if html:
                extract_urls(html)
        asyncio.ensure_future(consumer(pool))
    
    
    if __name__ == "__main__":
        loop = asyncio.get_event_loop()
        asyncio.ensure_future(main(loop))
        loop.run_forever()
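
The script assumes the target database already contains an artest table with a title column; the original post never shows the schema. Below is a minimal one-off setup sketch (not from the original post): the connection settings are placeholders mirroring the blanks in main(), and VARCHAR(255) is an assumed column size.

    import asyncio

    import aiomysql


    async def create_table():
        # Placeholder connection settings; substitute your own, as in main()
        conn = await aiomysql.connect(host="127.0.0.1", port=3306,
                                      user="root", password="",
                                      db="test", charset="utf8",
                                      autocommit=True)
        async with conn.cursor() as cur:
            # VARCHAR(255) is an assumed size; the post does not specify one
            await cur.execute(
                "create table if not exists artest ("
                "id int auto_increment primary key, "
                "title varchar(255) not null)"
            )
        conn.close()


    if __name__ == "__main__":
        asyncio.get_event_loop().run_until_complete(create_table())

With autocommit=True the DDL takes effect immediately; run it once before starting the crawler.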
  • Original article: https://www.cnblogs.com/yejing-snake/p/14276613.html