  • Storing crawler data in a MongoDB database

    # Before doing anything, start the MongoDB server, then create a new file named mongo_cache.py
    import pickle
    import zlib
    from datetime import datetime,timedelta
    
    import requests
    from pymongo import MongoClient
    from bson.binary import Binary
    
    class MongoCache(object):
        """
        数据库缓存
        """
        def __init__(self, client=None, expires=timedelta(days=30)):
            # Reuse an existing client if one is passed in, otherwise connect locally.
            self.client = client or MongoClient("localhost", 27017)
            self.db = self.client.cache
            # Index the timestamp field to speed up lookups and to set a TTL: once a
            # document is older than expireAfterSeconds, MongoDB deletes it automatically.
            self.db.webpage.create_index('timestamp', expireAfterSeconds=expires.total_seconds())
    
    
        def __setitem__(self, key, value):
            # Pickle and zlib-compress the value, and attach a UTC timestamp.
            record = {"result": Binary(zlib.compress(pickle.dumps(value))), "timestamp": datetime.utcnow()}
            # upsert=True inserts when the key is missing and updates when it already exists;
            # the $set operator overwrites the stored fields.
            self.db.webpage.update_one({"_id": key}, {'$set': record}, upsert=True)
    
        def __getitem__(self, item):
            # Look up the page by _id, using the URL (item) as the key.
            record = self.db.webpage.find_one({"_id": item})
            if record:
                # Found: decompress and unpickle the stored result.
                return pickle.loads(zlib.decompress(record["result"]))
            else:
                raise KeyError(item + " does not exist")  # no cached entry for this key
    
        def __contains__(self, item):
            try:
                self[item]  # delegates to __getitem__
            except KeyError:
                return False  # a KeyError means the key is not cached
            else:
                return True  # data was found, so the page has already been cached
    
        def clear(self):
            # Drop the whole collection, wiping the cache.
            self.db.webpage.drop()
    
    if __name__ == '__main__':
        mongo_cache = MongoCache()
        url = 'http://tieba.baidu.com/f?kw=猫&red_tag=1'
        response = requests.get(url)
        mongo_cache[url] = response.text
        print(mongo_cache[url])
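    # A side note (not from the original post): create_index will not change
    # expireAfterSeconds on an index that already exists, and MongoDB's TTL
    # monitor only purges expired documents roughly once a minute. A minimal
    # sketch, assuming the cache database built by MongoCache above, for
    # inspecting the TTL index and switching to a new expiry:
    from datetime import timedelta
    from pymongo import MongoClient

    client = MongoClient("localhost", 27017)
    webpage = client.cache.webpage

    # Show existing indexes; the TTL index lists its expireAfterSeconds option.
    print(webpage.index_information())

    # To change the expiry, drop the index and recreate it with the new value.
    webpage.drop_index("timestamp_1")  # default name of a single-field ascending index on 'timestamp'
    webpage.create_index("timestamp", expireAfterSeconds=int(timedelta(days=7).total_seconds()))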
    # Then create another file to exercise the cache
    import requests
    
    import mongo_cache
    
    download_url = "http://tieba.baidu.com/f?kw=猫&red_tag=2"
    download_response = requests.get(download_url)
    m_cache = mongo_cache.MongoCache()
    m_cache[download_url] = download_response.content
    print(m_cache[download_url].decode('utf-8'))
    print(download_url in m_cache)
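    # A small usage sketch (a hypothetical cached_get helper, not part of the
    # original post) showing how __contains__, __getitem__ and __setitem__
    # combine into a cache-aware downloader that only hits the network when
    # the URL has not been cached yet:
    import requests

    import mongo_cache

    def cached_get(url, cache=None):
        """Return the page bytes for url, reusing the MongoDB cache when possible."""
        cache = cache or mongo_cache.MongoCache()
        if url in cache:  # __contains__ delegates to __getitem__
            return cache[url]  # already cached: decompressed, unpickled bytes
        response = requests.get(url)
        cache[url] = response.content  # compressed and stored with a timestamp
        return response.content

    if __name__ == '__main__':
        html = cached_get("http://tieba.baidu.com/f?kw=猫&red_tag=2")
        print(html.decode('utf-8')[:200])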
    

      

  • Original article: https://www.cnblogs.com/liangliangzz/p/10142341.html