zoukankan      html  css  js  c++  java
  • 相似哈希simhash

    #!/usr/bin/env python
    # -*- coding=utf-8 -*-
     
    # Implementation of Charikar simhashes in Python
    # See: http://dsrg.mff.cuni.cz/~holub/sw/shash/#a1
     
    class simhash(object):
        """Charikar similarity hash (simhash) of a sequence of string tokens.

        The fingerprint is stored in ``self.hash`` as a non-negative integer
        and its width in bits in ``self.hashbits``.
        """

        def __init__(self, tokens='', hashbits=128):
            """Build the fingerprint of *tokens*.

            tokens   -- iterable of strings; each element is hashed on its
                        own (a plain string is therefore treated as a
                        sequence of single characters).
            hashbits -- width of the fingerprint in bits.
            """
            self.hashbits = hashbits
            self.hash = self.simhash(tokens)

        def __str__(self):
            """Return the fingerprint as a decimal string."""
            return str(self.hash)

        def __int__(self):
            """Return the fingerprint as an integer."""
            return int(self.hash)

        # Python 2 calls __long__ for long(obj); Python 3 has no ``long``
        # builtin, so both conversions share one implementation.
        __long__ = __int__

        def __float__(self):
            """Return the fingerprint converted to a float."""
            return float(self.hash)

        def simhash(self, tokens):
            """Return the Charikar simhash of *tokens* as an integer.

            Each token hash casts one vote per bit position: +1 if the bit
            is set in the token hash, -1 otherwise. A fingerprint bit is set
            when its vote total is >= 0, so ties (and an empty token list)
            produce set bits.
            """
            votes = [0] * self.hashbits

            for token_hash in (self._string_hash(tok) for tok in tokens):
                for i in range(self.hashbits):
                    # +1 when bit i of the token hash is set, otherwise -1.
                    votes[i] += 1 if token_hash & (1 << i) else -1

            # The fingerprint is the sum of the bits whose total is >= 0.
            fingerprint = 0
            for i, total in enumerate(votes):
                if total >= 0:
                    fingerprint |= 1 << i
            return fingerprint

        def _string_hash(self, v):
            """Return a hash of string *v*; the empty string hashes to 0.

            The multiply/xor loop is masked to ``self.hashbits`` bits, then
            the result is XORed with ``len(v)``.
            """
            if v == "":
                return 0
            x = ord(v[0]) << 7
            m = 1000003
            mask = 2 ** self.hashbits - 1
            for c in v:
                x = ((x * m) ^ ord(c)) & mask
            # x is non-negative here (masked, then XORed with a non-negative
            # length), so no fix-up for a -1 result is needed.
            x ^= len(v)
            return x

        def hamming_distance(self, other_hash):
            """Return how many of the low ``self.hashbits`` bits differ.

            other_hash -- another object exposing an integer ``hash``
                          attribute.
            """
            x = (self.hash ^ other_hash.hash) & ((1 << self.hashbits) - 1)
            return bin(x).count("1")

        def similarity(self, other_hash):
            """Return the ratio smaller/larger of the two hashes as floats.

            The ratio compares the numeric values of the fingerprints, not
            their bit patterns. Equal values, including two zero hashes,
            give 1.0 rather than dividing by zero.

            other_hash -- any object accepted by ``float()``.
            """
            a = float(self.hash)
            b = float(other_hash)
            if a == b:
                # Also covers a == b == 0, which would otherwise divide by zero.
                return 1.0
            if a > b:
                return b / a
            return a / b
     
    if __name__ == '__main__':
        # Demo: fingerprint two sentences that differ only in punctuation
        # and report how close the resulting hashes are.
        # NOTE: neither sentence contains whitespace, so split() yields a
        # single token for each of them.
        s = '看看哪些东西google最看重?标点?'
        hash1 = simhash(s.split())

        s = '看看哪些东西google最看重!标点!'
        hash2 = simhash(s.split())

        # Single-argument print(...) calls emit the same text on Python 2
        # and Python 3.
        print('%f%% percent similarity on hash' % (100 * hash1.similarity(hash2)))
        print('%d bits differ out of %d' % (hash1.hamming_distance(hash2), hash1.hashbits))
    

      

  • 相关阅读:
    我的2018:OCR、实习和秋招
    【OCR技术系列之六】文本检测CTPN的代码实现
    【OCR技术系列之五】自然场景文本检测技术综述(CTPN, SegLink, EAST)
    如何免费使用谷歌搜索
    CUDA编程之快速入门
    我在北京实习的四个月
    在C++98基础上学习C++11新特性
    Linux编程之线程池的设计与实现(C++98)
    ASP.NET Core中使用IOC三部曲(三.采用替换后的Autofac来实现AOP拦截)
    ASP.NET Core文件上传与下载(多种上传方式)
  • 原文地址:https://www.cnblogs.com/imouren/p/2850093.html
Copyright © 2011-2022 走看看