zoukankan      html  css  js  c++  java
  • 抓取之近似网页过滤

      抓取的网页内容中,有大部分会是相似的,抓取时就要过滤掉,开始考虑用VSM算法,后来发现不对,要比较太多东西了,然后就发现了simHash算法,这个算法的解释我就懒得copy了,simhash算法对于短数据的支持不好,但是,我本来就是很长的数据,用上!

      源码实现网上也有不少,但是貌似都是同样的,里面写得不清不楚的,虽然效果基本能达到,但是不清楚的东西,我用来做啥?

      仔细研究simhash算法的说明后,把里面字符串的hash算法换成的fvn-1算法,这个在http://www.isthe.com/chongo/tech/comp/fnv/里面有说明了,具体的那些固定数值,网站上都写了。原先代码里面有些处理,和算法不符的,也换掉了。

      首先搞起IKAnalyzer,切词并计算每个词的频率:

    package com.cnblogs.zxub.lucene.similarity;
    
    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;
    
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.wltea.analyzer.lucene.IKAnalyzer;
    
    public class WordsSpliter {
    
        public static Map<String, Integer> getSplitedWords(String str)
                throws IOException {
            // str = str.replaceAll("[0-9a-zA-Z]", "");
            Analyzer analyzer = new IKAnalyzer();
            Reader r = new StringReader(str);
            TokenStream ts = analyzer.tokenStream("searchValue", r);
            ts.addAttribute(CharTermAttribute.class);
    
            Map<String, Integer> result = new HashMap<String, Integer>();
            while (ts.incrementToken()) {
                CharTermAttribute ta = ts.getAttribute(CharTermAttribute.class);
                String word = ta.toString();
                if (!result.containsKey(word)) {
                    result.put(word, 0);
                }
                result.put(word, result.get(word) + 1);
            }
    
            return result;
        }
    }

       然后把SimHash的算法搞上:

    package com.cnblogs.zxub.lucene.similarity;
    
    import java.io.IOException;
    import java.math.BigInteger;
    import java.util.Map;
    import java.util.Set;
    
    public class SimHash {
    
        private static final int HASH_BITS = 64;
        private static final BigInteger FNV_64_INIT = new BigInteger(
                "14695981039346656037");
        private static final BigInteger FNV_64_PRIME = new BigInteger(
                "1099511628211");
        private static final BigInteger MASK_64 = BigInteger.ONE.shiftLeft(
                HASH_BITS).subtract(BigInteger.ONE);
    
        private String hash;
        private BigInteger signature;
    
        public SimHash(String content) throws IOException {
            super();
            this.analysis(content);
        }
    
        public String getHash() {
            return this.hash;
        }
    
        public BigInteger getSignature() {
            return this.signature;
        }
    
        private void analysis(String content) throws IOException {
            Map<String, Integer> wordInfos = WordsSpliter.getSplitedWords(content);
            int[] featureVector = new int[SimHash.HASH_BITS];
            Set<String> words = wordInfos.keySet();
            for (String word : words) {
                BigInteger wordhash = this.fnv1_64_hash(word);
                for (int i = 0; i < SimHash.HASH_BITS; i++) {
                    BigInteger bitmask = BigInteger.ONE.shiftLeft(SimHash.HASH_BITS
                            - i - 1);
                    if (wordhash.and(bitmask).signum() != 0) {
                        featureVector[i] += wordInfos.get(word);
                    } else {
                        featureVector[i] -= wordInfos.get(word);
                    }
                }
            }
    
            BigInteger signature = BigInteger.ZERO;
            StringBuffer hashBuffer = new StringBuffer();
            for (int i = 0; i < SimHash.HASH_BITS; i++) {
                if (featureVector[i] >= 0) {
                    signature = signature.add(BigInteger.ONE
                            .shiftLeft(SimHash.HASH_BITS - i - 1));
                    hashBuffer.append("1");
                } else {
                    hashBuffer.append("0");
                }
            }
            this.hash = hashBuffer.toString();
            this.signature = signature;
        }
    
        // fnv-1 hash算法,将字符串转换为64位hash值
        private BigInteger fnv1_64_hash(String str) {
            BigInteger hash = FNV_64_INIT;
            int len = str.length();
            for (int i = 0; i < len; i++) {
                hash = hash.multiply(FNV_64_PRIME);
                hash = hash.xor(BigInteger.valueOf(str.charAt(i)));
            }
            hash = hash.and(MASK_64);
            return hash;
        }
    
        public int getHammingDistance(BigInteger targetSignature) {
            BigInteger x = this.getSignature().xor(targetSignature);
            String s = x.toString(2);
            return s.replaceAll("0", "").length();
        }
    
        public int getHashDistance(String targetHash) {
            int distance;
            if (this.getHash().length() != targetHash.length()) {
                distance = -1;
            } else {
                distance = 0;
                for (int i = 0; i < this.getHash().length(); i++) {
                    if (this.getHash().charAt(i) != targetHash.charAt(i)) {
                        distance++;
                    }
                }
            }
            return distance;
        }
    }

      数据库里面存个签名就好了,至于距离运算,本打算全部拉出来计算,后来发现oracle的bitand函数,就用它了!异或之后,转二进制字符串,把0去掉,取长度,再count一下长度小于4的,得到的结果就是很相似的内容数目了。以后再把计算改成用缓存的去,先偷个懒。

      oracle函数部分贴上(注意Oracle的length函数永远不会返回0,最后要用个nvl函数,还有就是bitand在数值太大的时候,会溢出导致结果失误,所以要用utl_raw.bit_and,后面两个函数中字符串还不能用64位,改成128位搞定,估计还能小点,不弄了): 

    create or replace function bitxor(a in number,b in number) return number
    is
    begin
        return return a+b-2*to_number(utl_raw.bit_and(to_char(a),to_char(b)));
    end; create or replace function dec2bit(v_num number) return varchar is v_rtn varchar(128); v_n1 number; v_n2 number; begin v_n1 := v_num; loop v_n2 := mod(v_n1, 2); v_n1 := trunc(v_n1 / 2); v_rtn := to_char(v_n2) || v_rtn; exit when v_n1 = 0; end loop; return v_rtn; end; create or replace function hm_distance(a in number,b in number) return number is v_dis number; v_xor number; v_bit varchar(128); begin v_xor:=bitxor(a,b); v_bit:=dec2bit(v_xor); v_dis:=length(replace(v_bit,'0','')); return nvl(v_dis,0); end;

      跑一下 select hm_distance(1108937774045716955,1108937774045721051) from dual ,结果为1,o了。

      后面去用了下,发现fnv1居然正好撞到一个神奇的万金油,改成fnv1a就好了,代码就不改了。。。

  • 相关阅读:
    优先队列
    Problem W UVA 662 二十三 Fast Food
    UVA 607 二十二 Scheduling Lectures
    UVA 590 二十一 Always on the run
    UVA 442 二十 Matrix Chain Multiplication
    UVA 437 十九 The Tower of Babylon
    UVA 10254 十八 The Priest Mathematician
    UVA 10453 十七 Make Palindrome
    UVA 10163 十六 Storage Keepers
    UVA 1252 十五 Twenty Questions
  • 原文地址:https://www.cnblogs.com/zxub/p/3918422.html
Copyright © 2011-2022 走看看