  • aio crawler: dedup and database insertion

    # aio crawler: dedupe URLs and insert results into MySQL
    import asyncio
    import aiohttp
    import aiomysql
    import re
    from pyquery import PyQuery
    
    stopping = False  # never set to True below; see the shutdown sketch after the listing
    start_url = 'http://www.jobbole.com/'
    
    waiting_urls = []
    seen_urls = set()
    # URL dedup -- a Bloom filter would scale better than this in-memory set (see the sketch after the listing)
    
    sem = asyncio.Semaphore(3)  # cap the number of concurrent requests
    
    async def fetch(url, session):
        # returns the page body, or None on error / non-2xx status
        async with sem:
            # await asyncio.sleep(0.5)
            try:
                async with session.get(url) as resp:
                    print(resp.status)
                    if resp.status in [200, 201]:
                        data = await resp.text()
                        return data
            except Exception as e:
                print(e)
    
    
    # a plain function: parsing is CPU-bound, there is no I/O to await
    def extract_urls(html):
        urls = []
        if html is None:  # fetch() may have failed
            return urls
        pq = PyQuery(html)
        for link in pq.items('a'):
            url = link.attr('href')
            if url and url.startswith('http') and url not in seen_urls:
                urls.append(url)
                waiting_urls.append(url)
        return urls
    
    
    async def init_urls(url, session):
        html = await fetch(url, session)
        seen_urls.add(url)
        extract_urls(html)
    
    async def article_handler(url, session, pool):
        # fetch an article page, re-harvest its links, and store the title
        html = await fetch(url, session)
        seen_urls.add(url)
        if html is None:
            return
        extract_urls(html)
        pq = PyQuery(html)
        title = pq('title').text()
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                # parameterized query: never interpolate page content into SQL
                await cur.execute('insert into aiomysql_test(title) values (%s)', (title,))
    
    
    async def consumer(pool):
        async with aiohttp.ClientSession() as session:
            while not stopping:
                if len(waiting_urls) == 0:
                    await asyncio.sleep(0.5)
                    continue
                url = waiting_urls.pop()
                print('start get url: {}'.format(url))
                # article pages look like http://xxx.jobbole.com/12345/
                if re.match(r'http://.*?jobbole\.com/\d+/', url):
                    if url not in seen_urls:
                        asyncio.ensure_future(article_handler(url, session, pool))
                        await asyncio.sleep(0.5)
                else:
                    if url not in seen_urls:
                        asyncio.ensure_future(init_urls(url, session))
    
    
    async def main(loop):
        # wait for the MySQL connection pool to be established
        pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                          user='root', password='123456',
                                          db='aiomysql_test', loop=loop,
                                          charset='utf8', autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await fetch(start_url, session)
            seen_urls.add(start_url)
            extract_urls(html)
    
        asyncio.ensure_future(consumer(pool))
    
    if __name__ == "__main__":
        loop = asyncio.get_event_loop()
        asyncio.ensure_future(main(loop))
        loop.run_forever()
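
The dedup comment above suggests swapping the in-memory `seen_urls` set for a Bloom filter once the crawl grows large. Below is a minimal, self-contained sketch of that idea; the `BloomFilter` class, its sizing parameters, and the double-hashing scheme are illustrative assumptions, not part of the original post:

    import hashlib

    class BloomFilter:
        # Constant memory, no false negatives, tunable false-positive rate.
        def __init__(self, size_bits=1 << 20, num_hashes=7):
            self.size = size_bits            # total bits in the filter
            self.num_hashes = num_hashes     # probe positions per item
            self.bits = bytearray(size_bits // 8)

        def _positions(self, item):
            # Double hashing: derive k probe positions from one md5 digest.
            digest = hashlib.md5(item.encode('utf-8')).digest()
            h1 = int.from_bytes(digest[:8], 'big')
            h2 = int.from_bytes(digest[8:], 'big')
            for i in range(self.num_hashes):
                yield (h1 + i * h2) % self.size

        def add(self, item):
            for pos in self._positions(item):
                self.bits[pos // 8] |= 1 << (pos % 8)

        def __contains__(self, item):
            return all(self.bits[pos // 8] & (1 << (pos % 8))
                       for pos in self._positions(item))

Because it implements `add` and `__contains__`, the filter drops in where the set is used today: `seen_urls = BloomFilter()` keeps `url not in seen_urls` and `seen_urls.add(url)` working unchanged, at the cost of a small false-positive rate (a small fraction of new URLs may be wrongly treated as seen and skipped).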
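
One quirk in the listing: nothing ever sets `stopping = True`, so `loop.run_forever()` keeps the crawler alive until the process is killed. Here is a minimal sketch of a time-boxed shutdown, assuming a fixed crawl duration is acceptable (the `stop_after` helper is an illustrative addition, not from the original post):

    async def stop_after(seconds):
        # Flip the global flag so consumer() falls out of its while-loop,
        # then stop the event loop so run_forever() returns.
        global stopping
        await asyncio.sleep(seconds)
        stopping = True
        asyncio.get_event_loop().stop()

    # in __main__, next to main():
    # asyncio.ensure_future(stop_after(60))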
  • Original post: https://www.cnblogs.com/Erick-L/p/8939607.html