zoukankan      html  css  js  c++  java
  • 布隆过滤器的代码

    # -*- coding: utf-8 -*-
    import redis
    from hashlib import md5
    
    class SimpleHash(object):
        """Seeded multiplicative string hash folded into ``cap`` slots.

        ``cap`` and ``seed`` are stored as given. The result of :meth:`hash`
        is masked with ``cap - 1``, so it behaves like a modulo only when
        ``cap`` is a power of two.
        """

        def __init__(self, cap, seed):
            self.seed = seed
            self.cap = cap

        def hash(self, value):
            """Return the slot index for *value*.

            :param value: sequence of single characters (each item is passed
                to ``ord``)
            :return: ``(cap - 1) & accumulated_hash``; ``0`` for empty input
            """
            mask = self.cap - 1
            acc = 0
            for ch in value:
                # Note the ``+=``: each step is acc * (seed + 1) + ord(ch),
                # not the textbook acc * seed + ord(ch). Stored bit positions
                # depend on this exact recurrence.
                acc += self.seed * acc + ord(ch)
            return mask & acc
    
    
    class BloomFilter(object):
        """Bloom filter whose bit array lives in Redis string keys.

        Every input is reduced to its MD5 hex digest. The first two hex
        characters of the digest select one of ``blockNum`` Redis keys
        (``key + str(block_index)``), and each seeded ``SimpleHash`` maps the
        digest to one bit offset inside that key.
        """

        def __init__(self, host='localhost', port=6379, db=0, password=None,
                     blockNum=1, key='bloomfilter'):
            """
            :param host: the host of Redis
            :param port: the port of Redis
            :param db: which db in Redis
            :param password: password handed to ``redis.Redis`` (default None)
            :param blockNum: one blockNum for about 90,000,000; if you have
                more strings for filtering, increase it. Must be non-zero
                because it is used as a modulus; the block index comes from
                one byte of the digest, so values above 256 add no blocks.
            :param key: the key's name in Redis
            """
            self.server = redis.Redis(host=host, port=port, db=db, password=password)
            # Redis strings are capped at 512 MB; 1 << 31 bits uses 256 MB
            # per block key.
            self.bit_size = 1 << 31
            self.seeds = [5, 7, 11, 13, 31, 37, 61]
            self.key = key
            self.blockNum = blockNum
            self.hashfunc = [SimpleHash(self.bit_size, seed) for seed in self.seeds]

        def _block_and_digest(self, str_input):
            """Return ``(redis_key, md5_hexdigest)`` for a non-empty string."""
            digest = md5(str_input.encode('utf-8')).hexdigest()
            # First digest byte (0..255) spreads inputs across the block keys.
            block_index = int(digest[0:2], 16) % self.blockNum
            return self.key + str(block_index), digest

        def isContains(self, str_input):
            """Report whether *str_input* may have been inserted.

            :param str_input: text to look up; empty/None is never contained
            :return: ``False`` if any of the string's bits is unset (it was
                definitely not inserted), ``True`` if all are set (it was
                inserted, or other entries happen to cover the same bits)
            """
            if not str_input:
                return False
            name, digest = self._block_and_digest(str_input)
            # all() stops at the first unset bit, so a miss costs as few
            # Redis round-trips as possible, and the result is a real bool.
            return all(
                self.server.getbit(name, f.hash(digest)) for f in self.hashfunc
            )

        def insert(self, str_input):
            """Set every bit belonging to *str_input*.

            Empty/None input is ignored: ``isContains`` never reports it, so
            storing its bits would only raise the false-positive rate.

            :param str_input: text to record in the filter
            """
            if not str_input:
                return
            name, digest = self._block_and_digest(str_input)
            for f in self.hashfunc:
                self.server.setbit(name, f.hash(digest), 1)
    
    
    if __name__ == '__main__':
        # Demo: look one URL up in the filter and record it on first sight.
        url = 'http://www.baidu.com'
        bloom = BloomFilter()
        if not bloom.isContains(url):
            # Not seen yet: report it, then add it to the filter.
            print('not exists!')
            bloom.insert(url)
        else:
            print('exists!')
    以上内容仅作为课堂笔记,如有雷同,请与我联系。
  • 相关阅读:
    不在models.py中的models
    Python多进程编程
    Python多线程编程
    Linux系统的数据写入机制--延迟写入
    Python读写文件你真的了解吗?
    面试 Linux 运维工作至少需要知道哪些知识?
    查找占用资源高的JAVA代码
    CPU的load和使用率傻傻分不清
    Python编写守护进程程序
    由Nginx的DNS缓存导致的访问404
  • 原文地址:https://www.cnblogs.com/ArtisticMonk/p/10255658.html
Copyright © 2011-2022 走看看