  • Lianjia (链家网) + gevent + MongoDB

    Writing to a file

    The script below uses gevent greenlets to crawl 100 pages of new-home listings from Lianjia's Shanghai site and appends the scraped fields to a local file.

    import gevent
    from gevent import monkey
    monkey.patch_all()
    from gevent.queue import Queue
    import time
    import requests
    import re
    
    start = time.perf_counter()
    work = Queue()
    
    # Queue up all 100 listing pages ahead of time.
    url = 'https://sh.fang.lianjia.com/loupan/pg{}/'
    for i in range(1, 101):
        work.put_nowait(url.format(i))
    
    info_set = set()
    
    # Each greenlet pulls URLs off the queue until it is empty.
    def spider():
        while not work.empty():
            url = work.get_nowait()
            res = requests.get(url).text
            # One multi-group regex grabs (title, district, area,
            # price number, price unit) from each listing card.
            title = re.findall('<a href="/loup.*?itle="(.*?)"'
                               '.*?<div class="resb.*?<span>(.*?)</span>'
                               '.*?<span>(.*?)</span>'
                               '.*?<span class="number">(.*?)</span>'
                               '.*?<span class="desc">&nbsp;(.*?)</span>', res, re.S)

            # A set deduplicates listings that appear on more than one page.
            for i in title:
                info_set.add(i)
    
    tasks = []
    
    # Spawn 200 greenlets to drain the queue concurrently; joinall's
    # 6-second timeout caps the total crawl time, dropping slow requests.
    for x in range(200):
        task = gevent.spawn(spider)
        tasks.append(task)
    gevent.joinall(tasks, timeout=6)
    
    
    for i, n in enumerate(info_set):

        title = f'Title:  {n[0]}'
        addr = f'District:  {n[1]}{n[2]}'
        price = f'Price:  {n[3]}{n[4]}'
        print(f"""
        {i}
        {title}
        {addr}
        {price}
        """)

        # Append one record per listing; the trailing '\n' separates records.
        with open('./lianjia.csv', 'a', encoding='utf-8') as f:
            f.writelines([title, addr, price, '\n'])
            print('record written')
    
    print(time.perf_counter()-start)
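
    Note that the script writes plain concatenated strings, so the .csv file is not truly comma-separated. A minimal alternative using the standard csv module (the field names here are illustrative, assuming the same five-element tuples in info_set):

    import csv

    # info_set holds (title, district, area, number, unit) tuples from spider().
    with open('./lianjia.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['title', 'district', 'area', 'price', 'unit'])
        writer.writerows(info_set)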
    

    MongoDB bulk insert

    When the crawler brings back a large amount of data, writing it to MongoDB one record at a time wastes resources.

    That is what insert_many() is for: collect the records in a list and insert them as one batch. But if you created a MongoDB unique index to keep duplicates out, some of the data may fail to be written.

    That is because insert_many() writes in order by default: once one record fails to insert, none of the records after it are written. So the default ordered parameter needs to be changed.

    With ordered=False, the records are inserted out of order (the server may process them in parallel), so each document's insert succeeds or fails independently of the others.
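
    Before the full script, a minimal sketch of just this behavior; it assumes a local MongoDB, and the database/collection names and sample documents are placeholders:

    from pymongo import MongoClient
    from pymongo.errors import BulkWriteError

    coll = MongoClient('localhost', 27017)['demo']['loupan']  # hypothetical names
    coll.create_index('title', unique=True)  # duplicate titles are rejected

    docs = [
        {'title': 'A', 'price': 100},
        {'title': 'A', 'price': 100},  # duplicate: violates the unique index
        {'title': 'B', 'price': 200},
    ]

    try:
        coll.insert_many(docs, ordered=False)  # 'B' still gets inserted
    except BulkWriteError as e:
        print(e.details['nInserted'])  # 2 of the 3 documents made it in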

    import gevent
    from gevent import monkey
    monkey.patch_all()
    from gevent.queue import Queue
    import time
    import requests
    import re
    from pymongo import MongoClient
    from pymongo.errors import BulkWriteError
    
    client = MongoClient('localhost', 27017)
    db = client['ljw']
    collection = db.lj  # keep distinct names for the database and collection
    
    start = time.perf_counter()
    work = Queue()
    
    # Queue up all 100 listing pages ahead of time.
    url = 'https://sh.fang.lianjia.com/loupan/pg{}/'
    for i in range(1, 101):
        work.put_nowait(url.format(i))
    
    info_set = set()
    
    # Same crawler as the first script: greenlets drain the URL queue.
    def spider():
        while not work.empty():
            url = work.get_nowait()
            res = requests.get(url).text
            title = re.findall('<a href="/loup.*?itle="(.*?)"'
                               '.*?<div class="resb.*?<span>(.*?)</span>'
                               '.*?<span>(.*?)</span>'
                               '.*?<span class="number">(.*?)</span>'
                               '.*?<span class="desc">&nbsp;(.*?)</span>',res,re.S)
    
            for i in title:
                info_set.add(i)
    
    tasks = []
    
    for x in range(200):
        task = gevent.spawn(spider)
        tasks.append(task)
    gevent.joinall(tasks,timeout=6)
    
    info_list = []
    
    for n in info_set:
    
        title = f'{n[0]}'
        addr = f'{n[1]}{n[2]}'
        price = f'{n[3]}{n[4]}'
    
        # Use named fields rather than the title as a dynamic key, so a
        # unique index on 'title' can reject duplicate listings.
        items = {'title': title, 'addr': addr, 'price': price}
        info_list.append(items)
    
    
    try:
        collection.insert_many(info_list, ordered=False)
    except BulkWriteError as e:
        # Report rejected documents instead of swallowing every exception.
        print(len(e.details['writeErrors']), 'documents were rejected')
    print(time.perf_counter()-start)
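
    A quick way to confirm the batch landed, using the same collection handle as above:

    # Count the documents now in the collection and peek at one of them.
    print(collection.count_documents({}))
    print(collection.find_one())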
    
    
    
    
  • Original post: https://www.cnblogs.com/kai-/p/12795638.html