  • URL deduplication -- Bloom filter: principle and a Python implementation

    https://blog.csdn.net/a1368783069/article/details/52137417
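
    A Bloom filter answers "have I seen this key?" with a fixed-size bit array
    and k hash functions: lookups can produce false positives but never false
    negatives, which is exactly the trade-off a crawler's URL deduplication can
    tolerate. As a quick orientation, here is a minimal dedup sketch; it
    assumes the module listed below is saved on your path as pybloom.py:

        from pybloom import ScalableBloomFilter

        seen = ScalableBloomFilter(initial_capacity=1000, error_rate=0.001)

        def should_crawl(url):
            # add() returns True when the url was (probably) seen already,
            # False when it was newly inserted
            return not seen.add(url)

        assert should_crawl("https://example.com/")      # first visit: crawl
        assert not should_crawl("https://example.com/")  # duplicate: skip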

    # -*- encoding: utf-8 -*-
    """This module implements a bloom filter probabilistic data structure and
    an a Scalable Bloom Filter that grows in size as your add more items to it
    without increasing the false positive error_rate.
    
    Requires the bitarray library: http://pypi.python.org/pypi/bitarray/
    
        >>> from pybloom import BloomFilter
        >>> f = BloomFilter(capacity=10000, error_rate=0.001)
        >>> for i in range(0, f.capacity):
        ...     _ = f.add(i)
        ...
        >>> 0 in f
        True
        >>> f.capacity in f
        False
        >>> len(f) <= f.capacity
        True
        >>> abs((len(f) / float(f.capacity)) - 1.0) <= f.error_rate
        True
    
        >>> from pybloom import ScalableBloomFilter
        >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> count = 10000
        >>> for i in range(0, count):
        ...     _ = sbf.add(i)
        ...
        >>> sbf.capacity > count
        True
        >>> len(sbf) <= count
        True
        >>> abs((len(sbf) / float(count)) - 1.0) <= sbf.error_rate
        True
    
    """
    import math
    import hashlib
    from struct import unpack, pack, calcsize
    
    try:
        import bitarray
    except ImportError:
        raise ImportError('pybloom requires bitarray >= 0.3.4')
    
    __version__ = '1.1'
    __author__  = ("Jay Baird <jay@mochimedia.com>, Bob Ippolito <bob@redivi.com>, "
                   "Marius Eriksen <marius@monkey.org>, "
                   "Alex Brasetvik <alex@brasetvik.com>")
    
    def make_hashfuncs(num_slices, num_bits):
        if num_bits >= (1 << 31):
            fmt_code, chunk_size = 'Q', 8
        elif num_bits >= (1 << 15):
            fmt_code, chunk_size = 'I', 4
        else:
            fmt_code, chunk_size = 'H', 2
        total_hash_bits = 8 * num_slices * chunk_size
        if total_hash_bits > 384:
            hashfn = hashlib.sha512
        elif total_hash_bits > 256:
            hashfn = hashlib.sha384
        elif total_hash_bits > 160:
            hashfn = hashlib.sha256
        elif total_hash_bits > 128:
            hashfn = hashlib.sha1
        else:
            hashfn = hashlib.md5
        fmt = fmt_code * (hashfn().digest_size // chunk_size)
        num_salts, extra = divmod(num_slices, len(fmt))
        if extra:
            num_salts += 1
        salts = [hashfn(hashfn(pack('I', i)).digest()) for i in range(num_salts)]
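        # Scheme note: instead of k independent hash functions, one digest is
        # computed per salt (a pre-hashed counter) and unpacked into fixed-
        # width unsigned ints; each int is reduced modulo num_bits (here, the
        # width of one slice) so it indexes its own slice of the bit array,
        # and surplus indexes beyond num_slices are discarded below.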
        def _make_hashfuncs(key):
            # normalize every key to UTF-8 bytes so hashing works uniformly
            # for str and non-str keys (the old Python 2 unicode branch that
            # used to live here was dropped)
            key = str(key).encode("utf-8")
            rval = []
            for salt in salts:
                h = salt.copy()
                h.update(key)
                rval.extend(uint % num_bits for uint in unpack(fmt, h.digest()))
            del rval[num_slices:]
            return rval
        return _make_hashfuncs
    
    
    class BloomFilter(object):
        FILE_FMT = '<dQQQQ'
    
        def __init__(self, capacity, error_rate=0.001):
            """Implements a space-efficient probabilistic data structure
    
            capacity
                this BloomFilter must be able to store at least *capacity* elements
                while maintaining no more than *error_rate* chance of false
                positives
            error_rate
                the error_rate of the filter returning false positives. This
            determines the filter's capacity. Inserting more than capacity
                elements greatly increases the chance of false positives.
    
            >>> b = BloomFilter(capacity=100000, error_rate=0.001)
            >>> b.add("test")
            False
            >>> "test" in b
            True
    
            """
            if not (0 < error_rate < 1):
                raise ValueError("Error_Rate must be between 0 and 1.")
            if not capacity > 0:
                raise ValueError("Capacity must be > 0")
            # given M = num_bits, k = num_slices, p = error_rate, n = capacity
            # solving for m = bits_per_slice
            # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
            # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
            # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
            num_slices = int(math.ceil(math.log(1 / error_rate, 2)))
            # the error_rate constraint assumes a fill rate of 1/2
            # so we double the capacity to simplify the API
            bits_per_slice = int(math.ceil(
                (2 * capacity * abs(math.log(error_rate))) /
                (num_slices * (math.log(2) ** 2))))
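            # Worked example (illustrative): capacity=10000, error_rate=0.001
            # yields num_slices = ceil(log2(1000)) = 10 and bits_per_slice =
            # ceil(2*10000*|ln(0.001)| / (10*ln(2)**2)) = 28756, i.e. a bit
            # array of 287560 bits, roughly 35 KB.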
            self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
            self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
            self.bitarray.setall(False)
    
        def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
            self.error_rate = error_rate
            self.num_slices = num_slices
            self.bits_per_slice = bits_per_slice
            self.capacity = capacity
            self.num_bits = num_slices * bits_per_slice
            self.count = count
            self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
    
        def __contains__(self, key):
            """Tests a key's membership in this bloom filter.
    
            >>> b = BloomFilter(capacity=100)
            >>> b.add("hello")
            False
            >>> "hello" in b
            True
    
            """
            bits_per_slice = self.bits_per_slice
            bitarray = self.bitarray
            if not isinstance(key, list):
                hashes = self.make_hashes(key)
            else:
                hashes = key
            offset = 0
            for k in hashes:
                if not bitarray[offset + k]:
                    return False
                offset += bits_per_slice
            return True
    
        def __len__(self):
            """Return the number of keys stored by this bloom filter."""
            return self.count
    
        def add(self, key, skip_check=False):
            """ Adds a key to this bloom filter. If the key already exists in this
            filter it will return True. Otherwise False.
    
            >>> b = BloomFilter(capacity=100)
            >>> b.add("hello")
            False
            >>> b.add("hello")
            True
    
            """
            bitarray = self.bitarray
            bits_per_slice = self.bits_per_slice
            hashes = self.make_hashes(key)
            if not skip_check and hashes in self:
                return True
            if self.count > self.capacity:
                raise IndexError("BloomFilter is at capacity")
            offset = 0
            for k in hashes:
                self.bitarray[offset + k] = True
                offset += bits_per_slice
            self.count += 1
            return False
    
        def copy(self):
            """Return a copy of this bloom filter.
            """
            new_filter = BloomFilter(self.capacity, self.error_rate)
            new_filter.bitarray = self.bitarray.copy()
            return new_filter
    
        def union(self, other):
            """ Calculates the union of the two underlying bitarrays and returns
            a new bloom filter object."""
            if self.capacity != other.capacity or \
                    self.error_rate != other.error_rate:
                raise ValueError("Unioning filters requires both filters to "
                                 "have the same capacity and error rate")
            new_bloom = self.copy()
            new_bloom.bitarray = new_bloom.bitarray | other.bitarray
            return new_bloom
    
        def __or__(self, other):
            return self.union(other)
    
        def intersection(self, other):
            """ Calculates the union of the two underlying bitarrays and returns
            a new bloom filter object."""
            if self.capacity != other.capacity or 
                self.error_rate != other.error_rate:
                raise ValueError("Intersecting filters requires both filters to 
    have equal capacity and error rate")
            new_bloom = self.copy()
            new_bloom.bitarray = new_bloom.bitarray & other.bitarray
            return new_bloom
    
        def __and__(self, other):
            return self.intersection(other)
    
        def tofile(self, f):
            """Write the bloom filter to file object `f'. Underlying bits
            are written as machine values. This is much more space
            efficient than pickling the object."""
            f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
                         self.bits_per_slice, self.capacity, self.count))
            self.bitarray.tofile(f)
    
        @classmethod
        def fromfile(cls, f, n=-1):
            """Read a bloom filter from file-object `f' serialized with
            ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
            headerlen = calcsize(cls.FILE_FMT)
    
            if 0 < n < headerlen:
                raise ValueError('n too small!')
    
            filter = cls(1)  # Bogus instantiation, we will `_setup'.
            filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
            filter.bitarray = bitarray.bitarray(endian='little')
            if n > 0:
                filter.bitarray.fromfile(f, n - headerlen)
            else:
                filter.bitarray.fromfile(f)
            if filter.num_bits != filter.bitarray.length() and \
                    (filter.num_bits + (8 - filter.num_bits % 8)
                     != filter.bitarray.length()):
                raise ValueError('Bit length mismatch!')
    
            return filter
    
        def __getstate__(self):
            d = self.__dict__.copy()
            del d['make_hashes']
            return d
    
        def __setstate__(self, d):
            self.__dict__.update(d)
            self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice)
    
    class ScalableBloomFilter(object):
        SMALL_SET_GROWTH = 2 # slower, but takes up less memory
        LARGE_SET_GROWTH = 4 # faster, but takes up more memory faster
        FILE_FMT = '<idQd'
    
        def __init__(self, initial_capacity=100, error_rate=0.001,
                     mode=SMALL_SET_GROWTH):
            """Implements a space-efficient probabilistic data structure that
            grows as more items are added while maintaining a steady false
            positive rate
    
            initial_capacity
                the initial capacity of the filter
            error_rate
                the error_rate of the filter returning false positives. This
            determines the filter's capacity. Going over capacity greatly
                increases the chance of false positives.
            mode
                can be either ScalableBloomFilter.SMALL_SET_GROWTH or
                ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
                but uses less memory. LARGE_SET_GROWTH is faster but consumes
                memory faster.
    
            >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001,
            ...                         mode=ScalableBloomFilter.SMALL_SET_GROWTH)
            >>> b.add("test")
            False
            >>> "test" in b
            True
            >>> unicode_string = u'¡'
            >>> b.add(unicode_string)
            False
            >>> unicode_string in b
            True
            """
            if not (0 < error_rate < 1):
                raise ValueError("Error_Rate must be a decimal between 0 and 1.")
            self._setup(mode, 0.9, initial_capacity, error_rate)
            self.filters = []
    
        def _setup(self, mode, ratio, initial_capacity, error_rate):
            self.scale = mode
            self.ratio = ratio
            self.initial_capacity = initial_capacity
            self.error_rate = error_rate
    
        def __contains__(self, key):
            """Tests a key's membership in this bloom filter.
    
            >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
            ...                         mode=ScalableBloomFilter.SMALL_SET_GROWTH)
            >>> b.add("hello")
            False
            >>> "hello" in b
            True
    
            """
            for f in reversed(self.filters):
                if key in f:
                    return True
            return False
    
        def add(self, key):
            """Adds a key to this bloom filter.
            If the key already exists in this filter it will return True.
            Otherwise False.
    
            >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
            ...                         mode=ScalableBloomFilter.SMALL_SET_GROWTH)
            >>> b.add("hello")
            False
            >>> b.add("hello")
            True
    
            """
            if key in self:
                return True
            filter = self.filters[-1] if self.filters else None
            if filter is None or filter.count >= filter.capacity:
                num_filters = len(self.filters)
                filter = BloomFilter(
                    capacity=self.initial_capacity * (self.scale ** num_filters),
                    error_rate=self.error_rate * (self.ratio ** num_filters))
                self.filters.append(filter)
            filter.add(key, skip_check=True)
            return False
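
        # Note on the growth scheme: the i-th internal filter has capacity
        # initial_capacity * scale**i and error budget error_rate * ratio**i,
        # so by a union bound the compounded false-positive probability stays
        # below roughly error_rate * sum(ratio**i) = error_rate / (1 - ratio).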
    
        @property
        def capacity(self):
            """Returns the total capacity for all filters in this SBF"""
            return sum([f.capacity for f in self.filters])
    
        @property
        def count(self):
            return len(self)
    
        def tofile(self, f):
            """Serialize this ScalableBloomFilter into the file-object
            `f'."""
            f.write(pack(self.FILE_FMT, self.scale, self.ratio,
                         self.initial_capacity, self.error_rate))
    
            # Write #-of-filters
            f.write(pack('<l', len(self.filters)))
    
            if len(self.filters) > 0:
                # Then each filter directly, with a header describing
                # their lengths.
                headerpos = f.tell()
                headerfmt = '<' + 'Q'*(len(self.filters))
                f.write(b'.' * calcsize(headerfmt))  # reserve space; bytes, not str
                filter_sizes = []
                for filter in self.filters:
                    begin = f.tell()
                    filter.tofile(f)
                    filter_sizes.append(f.tell() - begin)
    
                f.seek(headerpos)
                f.write(pack(headerfmt, *filter_sizes))
    
        @classmethod
        def fromfile(cls, f):
            """Deserialize the ScalableBloomFilter in file object `f'."""
            filter = cls()
            filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
            nfilters, = unpack('<l', f.read(calcsize('<l')))
            if nfilters > 0:
                header_fmt = '<' + 'Q'*nfilters
                bytes = f.read(calcsize(header_fmt))
                filter_lengths = unpack(header_fmt, bytes)
                for fl in filter_lengths:
                    filter.filters.append(BloomFilter.fromfile(f, fl))
            else:
                filter.filters = []
    
            return filter
    
        def __len__(self):
            """Returns the total number of elements stored in this SBF"""
            return sum([f.count for f in self.filters])
    
    
    if __name__ == "__main__":
        import doctest
        doctest.testmod()
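
    The filter also serializes compactly with tofile()/fromfile(). A minimal
    round-trip sketch (again assuming the module above is saved as pybloom.py;
    the file must be opened in binary mode, since the header is packed with
    struct and the bits are written by bitarray.tofile):

        from pybloom import BloomFilter

        bf = BloomFilter(capacity=1000, error_rate=0.001)
        bf.add("https://example.com/a")

        with open("bloom.bin", "wb") as fh:   # binary mode: raw bytes
            bf.tofile(fh)

        with open("bloom.bin", "rb") as fh:
            restored = BloomFilter.fromfile(fh)

        assert "https://example.com/a" in restored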
    
  • Original post: https://www.cnblogs.com/pythonClub/p/10116516.html