zoukankan      html  css  js  c++  java
  • asyncio异步采集小试一下 ,果然快!

    # _*_ coding: utf-8 _*_
    import codecs
    from bs4 import BeautifulSoup
    import time, json, math
    import sys, os
    import asyncio
    import aiohttp
    import aiofiles
    
    # Shared output file for scraped goods records; one JSON object per line.
    # Opened at import time and closed in main().
    f = codecs.open('goods.txt', 'w', encoding='utf-8', errors='ignore')
    # Caps concurrent HTTP fetches (pages and images) at 5.
    # NOTE(review): created at import time, before any event loop is running —
    # OK on modern Python where Semaphore no longer binds a loop at creation;
    # confirm against the target asyncio version.
    semaphore = asyncio.Semaphore(5)
    
    #asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    
    async def getHtml(url):
        """Fetch *url* under the global semaphore.

        For ``.jpg`` URLs the image bytes are saved to a local path mirroring
        the URL path and ``True`` is returned; for any other URL the decoded
        HTML text is returned.
        """
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as html:
                    if url.endswith('.jpg'):
                        img = await html.read()
                        # Mirror the remote path locally, e.g.
                        # http://www.13qh.com/a/b.jpg -> a/b.jpg
                        imgname = url.replace('http://www.13qh.com/', '')
                        imgpath = os.path.dirname(imgname)
                        if not os.path.exists(imgpath):
                            os.makedirs(imgpath)
                        # BUG FIX: the file handle was previously opened and
                        # never closed, leaking a descriptor per image.
                        async with aiofiles.open(imgname, 'wb') as fp:
                            await fp.write(img)
                        return True
                    else:
                        return await html.text(encoding='utf-8')
    
    async def getList(url, **cat):
        """Fetch a category listing page and crawl every product link on it.

        *cat* (``lan_id``/``sub_id``) is passed through to :func:`parse`.
        Parse errors are printed and the page is skipped.
        """
        tmp = await getHtml(url)
        try:
            htm = BeautifulSoup(tmp, 'lxml')
            links = htm.select('.goods-item .goods-pic a')
        except Exception as e:
            print(e)
            links = None
        # `if links:` replaces the non-idiomatic `!= None` check; an empty
        # result set is skipped either way.
        if links:
            for li in links:
                await parse(li.get('href'), **cat)
    
    async def parse(url, **cat):
        """Scrape one product detail page and append it to ``goods.txt``.

        Downloads the thumbnail and detail images as a side effect.  Any
        scraping error is printed and the record is dropped.
        """
        tmp = await getHtml(url)
        try:
            htm = BeautifulSoup(tmp, 'lxml')
            goods_id = url.split('/')[-1]
            goods_name = htm.select('.goods-title h3')[0].text
            goods_name_sub = htm.select('.goods-title p')[0].text
            goods_price = htm.select('.goods-info .sale_price')[0].text
            sale_price = htm.select('.goods-info ul li')[0].find('del').text
            # BUG FIX: filter() returns an iterator in Python 3; json.dumps
            # then raised TypeError, which the except below silently ate and
            # the whole record was lost.  Join it back into a string.
            sale_price = ''.join(ch for ch in sale_price if ch in '.0123456789')
            thumb_cont = htm.select('.thumb-cont ul li')
            print(goods_name)
            goods_thumb = []
            for thumb in thumb_cont:
                img = thumb.find('img').get('big')
                goods_thumb.append(img)
                print(img)
                await getHtml('http://www.13qh.com' + img)
            goods_detail = []
            for p in htm.select('.detail-content p img'):
                src = p.get('src')
                goods_detail.append(src)
                print(src)
                await getHtml('http://www.13qh.com' + src)
            goods = {
                'cat_id': cat['lan_id'],
                'sub_id': cat['sub_id'],
                'goods_id': goods_id,
                'goods_name': goods_name,
                'goods_price': goods_price,
                'sale_price': sale_price,
                'goods_thumb': goods_thumb,
                'goods_detail': goods_detail
            }
            # ensure_ascii=False keeps Chinese names readable in the utf-8 file.
            f.write(json.dumps(goods, ensure_ascii=False) + os.linesep)
        except Exception as e:
            print(e)
    
    async def caiz():
        """Crawl the site's category tree, fan out listing-page crawls, and
        dump the category table to ``category.txt``.
        """
        url = 'http://www.13qh.com/'
        tmp = await getHtml(url)
        htm = BeautifulSoup(tmp, 'lxml')
        cat = htm.select('.category-content>ul>li')

        category = []
        tasks = []
        for li in cat:
            lan = li.select('p a')[0]
            lan_text = lan.text
            lan_id = lan.get('href').split('/')[-1]
            category.append({'cat_id': lan_id, 'cat_name': lan_text, 'parent_id': 0})

            for u in li.select('.category-list ul li'):
                # BUG FIX: `sua = a.select('a')` is a ResultSet, so the old
                # `sua.text` raised AttributeError; iterate the anchors instead.
                for a in u.select('.a'):
                    for sua in a.select('a'):
                        sua_text = sua.text
                        sua_id = sua.get('href').split('/')[-1]
                        category.append({'cat_id': sua_id, 'cat_name': sua_text, 'parent_id': lan_id})

                ub = u.select('.b a')[0]
                sub_text = ub.text
                sub_id = ub.get('href').split('/')[-1]
                category.append({'cat_id': sub_id, 'cat_name': sub_text, 'parent_id': lan_id})

                for c in u.select('.c a'):
                    suc_text = c.text
                    suc_href = c.get('href')
                    suc_id = suc_href.split('/')[-1]
                    category.append({'cat_id': suc_id, 'cat_name': suc_text, 'parent_id': sub_id})

                    for i in range(1, 20):
                        tasks.append(asyncio.ensure_future(
                            getList("%s/page/%s" % (suc_href, i), lan_id=lan_id, sub_id=sub_id)))
        # BUG FIX: the listing tasks were fired and never awaited, so
        # asyncio.run() cancelled them all the moment caiz() returned and the
        # crawl never actually ran.  Await them before finishing.
        if tasks:
            await asyncio.gather(*tasks)
        with codecs.open('category.txt', 'w', encoding='utf-8', errors='ignore') as ff:
            ff.write(json.dumps(category))
    
    def main():
        """Run the crawl to completion, then close the shared output file."""
        # Dead `loop = asyncio.get_event_loop()` removed: asyncio.run() creates
        # and manages its own event loop.
        try:
            asyncio.run(caiz())
        finally:
            # Close goods.txt even if the crawl raises.
            f.close()
    
    if __name__ == '__main__':
        main()
  • 相关阅读:
    解决org.apache.ibatis.binding.BindingException: Invalid bound statement (not found): com.xyfer.dao.UserDao.findById
    Oracle使用MyBatis中RowBounds实现分页查询
    普元EOS开发经验总结——不定期持续更新中
    Vue数据列表倒计时展示
    Java后端学习路线
    Linux下命令行安装WebLogic 10.3.6
    Oracle快速运行一指禅
    maven学习知识点汇总
    EOS下控制台以及图形界面打印sql语句
    Myeclipse使用过程配置汇总
  • 原文地址:https://www.cnblogs.com/6min/p/14078916.html
Copyright © 2011-2022 走看看