  • aio crawler: dedup and database insertion

    # aio crawler: dedupe URLs and insert results into MySQL
    import asyncio
    import aiohttp
    import aiomysql
    import re
    from pyquery import PyQuery
    
    stopping = False  # never set to True below; see the shutdown sketch after the listing
    start_url = 'http://www.jobbole.com/'
    
    waiting_urls = []
    seen_urls = set()
    # URL dedup -- a Bloom filter would scale better than this in-memory set (see the sketch after the listing)
    
    sem = asyncio.Semaphore(3)  # cap the number of concurrent requests
    
    async def fetch(url, session):
        # returns the page body, or None on error / non-2xx status
        async with sem:
            # await asyncio.sleep(0.5)
            try:
                async with session.get(url) as resp:
                    print(resp.status)
                    if resp.status in [200, 201]:
                        data = await resp.text()
                        return data
            except Exception as e:
                print(e)
    
    
    # a plain function: parsing is CPU-bound, there is no I/O to await
    def extract_urls(html):
        urls = []
        if html is None:  # fetch() may have failed
            return urls
        pq = PyQuery(html)
        for link in pq.items('a'):
            url = link.attr('href')
            if url and url.startswith('http') and url not in seen_urls:
                urls.append(url)
                waiting_urls.append(url)
        return urls
    
    
    async def init_urls(url, session):
        html = await fetch(url, session)
        seen_urls.add(url)
        extract_urls(html)
    
    async def article_handler(url, session, pool):
        # fetch an article page, re-harvest its links, and store the title
        html = await fetch(url, session)
        seen_urls.add(url)
        if html is None:
            return
        extract_urls(html)
        pq = PyQuery(html)
        title = pq('title').text()
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                # parameterized query: never interpolate page content into SQL
                await cur.execute('insert into aiomysql_test(title) values (%s)', (title,))
    
    
    async def consumer(pool):
        async with aiohttp.ClientSession() as session:
            while not stopping:
                if len(waiting_urls) == 0:
                    await asyncio.sleep(0.5)
                    continue
                url = waiting_urls.pop()
                print('start get url: {}'.format(url))
                # article pages look like http://xxx.jobbole.com/12345/
                if re.match(r'http://.*?jobbole\.com/\d+/', url):
                    if url not in seen_urls:
                        asyncio.ensure_future(article_handler(url, session, pool))
                        await asyncio.sleep(0.5)
                else:
                    if url not in seen_urls:
                        asyncio.ensure_future(init_urls(url, session))
    
    
    async def main(loop):
        # wait for the MySQL connection pool to be established
        pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                          user='root', password='123456',
                                          db='aiomysql_test', loop=loop,
                                          charset='utf8', autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await fetch(start_url, session)
            seen_urls.add(start_url)
            extract_urls(html)
    
        asyncio.ensure_future(consumer(pool))
    
    if __name__ == "__main__":
        loop = asyncio.get_event_loop()
        asyncio.ensure_future(main(loop))
        loop.run_forever()
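
The dedup comment above suggests swapping the in-memory `seen_urls` set for a Bloom filter once the crawl grows large. Below is a minimal, self-contained sketch of that idea; the `BloomFilter` class, its sizing parameters, and the double-hashing scheme are illustrative assumptions, not part of the original post:

    import hashlib

    class BloomFilter:
        # Constant memory, no false negatives, tunable false-positive rate.
        def __init__(self, size_bits=1 << 20, num_hashes=7):
            self.size = size_bits            # total bits in the filter
            self.num_hashes = num_hashes     # probe positions per item
            self.bits = bytearray(size_bits // 8)

        def _positions(self, item):
            # Double hashing: derive k probe positions from one md5 digest.
            digest = hashlib.md5(item.encode('utf-8')).digest()
            h1 = int.from_bytes(digest[:8], 'big')
            h2 = int.from_bytes(digest[8:], 'big')
            for i in range(self.num_hashes):
                yield (h1 + i * h2) % self.size

        def add(self, item):
            for pos in self._positions(item):
                self.bits[pos // 8] |= 1 << (pos % 8)

        def __contains__(self, item):
            return all(self.bits[pos // 8] & (1 << (pos % 8))
                       for pos in self._positions(item))

Because it implements `add` and `__contains__`, the filter drops in where the set is used today: `seen_urls = BloomFilter()` keeps `url not in seen_urls` and `seen_urls.add(url)` working unchanged, at the cost of a small false-positive rate (a small fraction of new URLs may be wrongly treated as seen and skipped).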
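
One quirk in the listing: nothing ever sets `stopping = True`, so `loop.run_forever()` keeps the crawler alive until the process is killed. Here is a minimal sketch of a time-boxed shutdown, assuming a fixed crawl duration is acceptable (the `stop_after` helper is an illustrative addition, not from the original post):

    async def stop_after(seconds):
        # Flip the global flag so consumer() falls out of its while-loop,
        # then stop the event loop so run_forever() returns.
        global stopping
        await asyncio.sleep(seconds)
        stopping = True
        asyncio.get_event_loop().stop()

    # in __main__, next to main():
    # asyncio.ensure_future(stop_after(60))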
  • Original post: https://www.cnblogs.com/Erick-L/p/8939607.html