zoukankan      html  css  js  c++  java
  • 利用simhash计算文本相似度

    摘自:http://www.programcreek.com/java-api-examples/index.php?source_dir=textmining-master/src/com/gta/simhash/SimHash.java

    package com.gta.simhash;
     
    /**
     * Small command-line demo for {@link SimHash}.
     *
     * <p>Builds a 64-bit SimHash for each of two sample strings and asks the first
     * one to report whether the second lies within a Hamming distance of 8
     * (the outcome is printed by {@code SimHash.getResult}).
     *
     * <p>NOTE(review): the sample literals contain replacement characters in this
     * copy of the source, so they look mis-encoded; they are kept exactly as found.
     */
    public class Test {

        public static void main(String[] args) {
            final int fingerprintBits = 64;
            final int maxDistance = 8;

            String firstText = "�������Ϻ���������죬���������������������Ͼ������ݣ����������ţ����ϣ��ൺ���人�����ݣ����ڣ��ɶ���������̫ԭ����ɳ�����֣�������֣�ݣ���������������³ľ�룬���ݣ��������Ϸʣ��ߺ�";
            String secondText = "�������Ϻ���������죬���������������������Ͼ������ݣ����������ţ����ϣ��ൺ���人�����ݣ����ڣ��ɶ���������̫ԭ����ɳ�����֣�������֣�ݣ�����";

            SimHash reference = new SimHash(firstText, fingerprintBits, maxDistance);
            SimHash candidate = new SimHash(secondText, fingerprintBits, maxDistance);

            // Prints both fingerprints, their Hamming distance and the verdict.
            reference.getResult(candidate);
        }

    }
    package com.gta.simhash;
     
    import java.io.IOException; 
    import java.math.BigInteger; 
    import java.util.List; 
    import java.util.ArrayList; 
     
    import org.wltea.analyzer.lucene.IKAnalyzer; 
    import org.apache.lucene.analysis.TokenStream; 
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 
     
    /**
     * Computes a SimHash fingerprint for a piece of text and compares fingerprints
     * by Hamming distance.
     *
     * <p>The text is split into terms with {@code IKAnalyzer}; every distinct term is
     * hashed to {@code hashBits} bits and weighted by how often it occurs. Two texts
     * are reported as a match when their fingerprints differ in at most
     * {@code distance} bit positions.
     */
    public class SimHash {
        /** Fingerprint width used by the single-argument constructor. */
        private static final int DEFAULT_HASH_BITS = 64;
        /** Match threshold used by the single-argument constructor. */
        private static final int DEFAULT_DISTANCE = 5;
        /** Multiplier used by {@link #shiftHash(String)} when mixing characters. */
        private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(131313L);

        private final String tokens;
        private final int hashBits;
        private final int distance;

        /**
         * Creates a SimHash over {@code tokens} with a 64-bit fingerprint and a
         * match threshold of 5 differing bits.
         *
         * @param tokens the raw text to fingerprint
         */
        public SimHash(String tokens) {
            this(tokens, DEFAULT_HASH_BITS, DEFAULT_DISTANCE);
        }

        /**
         * Creates a SimHash with an explicit fingerprint width and match threshold.
         *
         * @param tokens   the raw text to fingerprint
         * @param hashBits number of bits in the fingerprint; must not be negative
         * @param distance largest Hamming distance still reported as a match
         * @throws IllegalArgumentException if {@code hashBits} is negative
         */
        public SimHash(String tokens, int hashBits, int distance) {
            if (hashBits < 0) {
                // A negative width could only fail later, inside the hashing methods.
                throw new IllegalArgumentException("hashBits must not be negative: " + hashBits);
            }
            this.tokens = tokens;
            this.hashBits = hashBits;
            this.distance = distance;
        }

        /**
         * Splits the text into terms and counts how often each distinct term occurs.
         *
         * <p>Terms appear in the returned list in order of first occurrence. If the
         * analyzer raises an {@link IOException}, the stack trace is printed and the
         * terms collected so far are returned.
         *
         * @return one {@code TermDict} per distinct term, carrying its frequency
         */
        public List<TermDict> tokenizer() {
            List<TermDict> terms = new ArrayList<TermDict>();
            // NOTE(review): the 'true' flag presumably selects IK's "smart" segmentation - confirm.
            IKAnalyzer analyzer = new IKAnalyzer(true);
            try {
                TokenStream stream = analyzer.tokenStream("", this.tokens);
                try {
                    CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
                    stream.reset();
                    while (stream.incrementToken()) {
                        String term = cta.toString();
                        int index = isContain(term, terms);
                        if (index >= 0) {
                            TermDict known = terms.get(index);
                            known.setFreq(known.getFreq() + 1);
                        } else {
                            terms.add(new TermDict(term, 1));
                        }
                    }
                    // Lucene's consumer workflow expects end() after the last token.
                    stream.end();
                } finally {
                    stream.close();
                }
            } catch (IOException e) {
                // Kept best-effort: report the failure and return what was collected.
                e.printStackTrace();
            } finally {
                // Release the analyzer even when tokenizing fails.
                analyzer.close();
            }
            return terms;
        }

        /**
         * Finds the position of the entry whose term equals {@code str}.
         *
         * @param str   the term to look for
         * @param terms the entries to search
         * @return index of the first entry with that term, or {@code -1} if none
         */
        public int isContain(String str, List<TermDict> terms) {
            for (int i = 0; i < terms.size(); i++) {
                if (str.equals(terms.get(i).getTerm())) {
                    return i;
                }
            }
            return -1;
        }

        /**
         * Folds weighted term hashes into a single fingerprint.
         *
         * <p>For every bit position, each term adds its frequency when its hash has
         * that bit set and subtracts it otherwise; the fingerprint bit is set when the
         * resulting sum is zero or positive.
         *
         * @param terms distinct terms with their frequencies
         * @return a non-negative fingerprint of at most {@code hashBits} bits
         */
        public BigInteger simHash(List<TermDict> terms) {
            int[] votes = new int[hashBits];
            for (TermDict td : terms) {
                int weight = td.getFreq();
                BigInteger termHash = shiftHash(td.getTerm());
                for (int i = 0; i < hashBits; i++) {
                    if (termHash.testBit(i)) {
                        votes[i] += weight;
                    } else {
                        votes[i] -= weight;
                    }
                }
            }

            BigInteger fingerPrint = BigInteger.ZERO;
            for (int i = 0; i < hashBits; i++) {
                // A tie (sum of exactly zero) counts as a set bit.
                if (votes[i] >= 0) {
                    fingerPrint = fingerPrint.setBit(i);
                }
            }
            return fingerPrint;
        }

        /**
         * Hashes a single term to a non-negative value of at most {@code hashBits} bits.
         *
         * @param str the term to hash
         * @return zero for a {@code null} or empty term, otherwise the masked hash
         */
        public BigInteger shiftHash(String str) {
            if (str == null || str.length() == 0) {
                return BigInteger.ZERO;
            }

            char[] chars = str.toCharArray();
            BigInteger mask = bitMask();

            // Seed from the first character, then fold every character in.
            BigInteger x = BigInteger.valueOf((long) chars[0] << 7);
            for (char c : chars) {
                x = x.multiply(HASH_MULTIPLIER).add(BigInteger.valueOf((long) c));
            }

            // Second pass: characters at even positions are mixed in shifted left by 3,
            // those at odd positions shifted right by 3; each step is cut to hashBits bits.
            for (int i = 0; i < chars.length; i++) {
                long mixed = (i % 2 == 0) ? ((long) chars[i] << 3) : ((long) chars[i] >> 3);
                x = x.multiply(HASH_MULTIPLIER).xor(BigInteger.valueOf(mixed)).and(mask);
            }

            // x was just masked with a non-negative mask, so it cannot be negative here.
            return x;
        }

        /**
         * Tokenizes the text and computes its fingerprint.
         *
         * @return the fingerprint of this instance's text
         */
        public BigInteger getSimHash() {
            return simHash(tokenizer());
        }

        /**
         * Counts the bit positions in which this fingerprint differs from another one.
         *
         * <p>Prints both fingerprints (binary, left-padded to {@code hashBits} digits)
         * and then the distance to standard output.
         *
         * @param hashData the other text to compare against
         * @return the number of differing bits within the low {@code hashBits} bits
         */
        public int getHammingDistance(SimHash hashData) {
            // Each fingerprint is computed once; tokenizing is the expensive step.
            BigInteger own = getSimHash();
            System.out.println(getFingerPrint(own.toString(2)));
            BigInteger other = hashData.getSimHash();
            System.out.println(getFingerPrint(other.toString(2)));

            int tot = own.xor(other).and(bitMask()).bitCount();
            System.out.println(tot);
            return tot;
        }

        /**
         * Left-pads a binary string with zeros up to {@code hashBits} characters.
         *
         * @param str the binary digits to pad
         * @return {@code str} itself when it is already long enough, otherwise the padded string
         */
        public String getFingerPrint(String str) {
            int missing = hashBits - str.length();
            if (missing <= 0) {
                return str;
            }
            StringBuilder padded = new StringBuilder(hashBits);
            for (int i = 0; i < missing; i++) {
                padded.append('0');
            }
            return padded.append(str).toString();
        }

        /**
         * Prints {@code "match"} when the other text is within the configured
         * distance of this one, and {@code "false"} otherwise.
         *
         * @param hashData the other text to compare against
         */
        public void getResult(SimHash hashData) {
            if (getHammingDistance(hashData) <= distance) {
                System.out.println("match");
            } else {
                System.out.println("false");
            }
        }

        /** Returns a value with the low {@code hashBits} bits set. */
        private BigInteger bitMask() {
            return BigInteger.ONE.shiftLeft(hashBits).subtract(BigInteger.ONE);
        }

    }
  • 相关阅读:
    HDU 5059 Help him
    HDU 5058 So easy
    HDU 5056 Boring count
    HDU 5055 Bob and math problem
    HDU 5054 Alice and Bob
    HDU 5019 Revenge of GCD
    HDU 5018 Revenge of Fibonacci
    HDU 1556 Color the ball
    CodeForces 702D Road to Post Office
    CodeForces 702C Cellular Network
  • 原文地址:https://www.cnblogs.com/twodog/p/12141246.html
Copyright © 2011-2022 走看看