zoukankan      html  css  js  c++  java
  • 网络爬虫速成指南 (四) URL判重

     
    如果采集量比较小:
    布隆过滤器详解原理

    如果采集量比较大:
    redis:主要是把URL转为md5,作为key来进行判重

    关于布隆过滤器器的参数说明,简单点说: private static BloomFilter<String> bloomFilter = new BloomFilter<String>(2X, X); 这样用就可以了
    附:布隆过滤器实现,@author Magnus Skjegstad <magnus@skjegstad.com>
    import java.io.Serializable; import java.nio.charset.Charset; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.BitSet; import java.util.Collection; /** * Implementation of a Bloom-filter, as described here: * http://en.wikipedia.org/wiki/Bloom_filter * * Inspired by the SimpleBloomFilter-class written by Ian Clarke. This * implementation provides a more evenly distributed Hash-function by using a * proper digest instead of the Java RNG. Many of the changes were proposed in * comments in his blog: * http://blog.locut.us/2008/01/12/a-decent-stand-alone-java * -bloom-filter-implementation/ * * @param <E> * Object type that is to be inserted into the Bloom filter, e.g. * String or Integer. * @author Magnus Skjegstad <magnus@skjegstad.com> */ public class BloomFilter<E> implements Serializable { private BitSet bitset; private int bitSetSize; private double bitsPerElement; private int expectedNumberOfFilterElements; // expected (maximum) number of // elements to be added private int numberOfAddedElements; // number of elements actually added to // the Bloom filter private int k; // number of hash functions static final Charset charset = Charset.forName("UTF-8"); // encoding used // for storing // hash values // as strings static final String hashName = "MD5"; // MD5 gives good enough accuracy in // most circumstances. Change to // SHA1 if it's needed static final MessageDigest digestFunction; static { // The digest method is reused between instances MessageDigest tmp; try { tmp = java.security.MessageDigest.getInstance(hashName); } catch (NoSuchAlgorithmException e) { tmp = null; } digestFunction = tmp; } /** * Constructs an empty Bloom filter. The total length of the Bloom filter * will be c*n. * * @param c * is the number of bits used per element. * @param n * is the expected number of elements the filter will contain. * @param k * is the number of hash functions used. */ public BloomFilter(double c, int n, int k) { this.expectedNumberOfFilterElements = n; this.k = k; this.bitsPerElement = c; this.bitSetSize = (int) Math.ceil(c * n); numberOfAddedElements = 0; this.bitset = new BitSet(bitSetSize); } /** * Constructs an empty Bloom filter. The optimal number of hash functions * (k) is estimated from the total size of the Bloom and the number of * expected elements. * * @param bitSetSize * defines how many bits should be used in total for the filter. * @param expectedNumberOElements * defines the maximum number of elements the filter is expected * to contain. */ public BloomFilter(int bitSetSize, int expectedNumberOElements) { this(bitSetSize / (double) expectedNumberOElements, expectedNumberOElements, (int) Math .round((bitSetSize / (double) expectedNumberOElements) * Math.log(2.0))); } /** * Constructs an empty Bloom filter with a given false positive probability. * The number of bits per element and the number of hash functions is * estimated to match the false positive probability. * * @param falsePositiveProbability * is the desired false positive probability. * @param expectedNumberOfElements * is the expected number of elements in the Bloom filter. */ public BloomFilter(double falsePositiveProbability, int expectedNumberOfElements) { this(Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2))) / Math.log(2), // c = k / ln(2) expectedNumberOfElements, (int) Math.ceil(-(Math .log(falsePositiveProbability) / Math.log(2)))); // k = // ceil // ( // - // log_2 // ( // false // prob // . // ) // ) } /** * Construct a new Bloom filter based on existing Bloom filter data. * * @param bitSetSize * defines how many bits should be used for the filter. * @param expectedNumberOfFilterElements * defines the maximum number of elements the filter is expected * to contain. * @param actualNumberOfFilterElements * specifies how many elements have been inserted into the * <code>filterData</code> BitSet. * @param filterData * a BitSet representing an existing Bloom filter. */ public BloomFilter(int bitSetSize, int expectedNumberOfFilterElements, int actualNumberOfFilterElements, BitSet filterData) { this(bitSetSize, expectedNumberOfFilterElements); this.bitset = filterData; this.numberOfAddedElements = actualNumberOfFilterElements; } /** * Generates a digest based on the contents of a String. * * @param val * specifies the input data. * @param charset * specifies the encoding of the input data. * @return digest as long. */ public static long createHash(String val, Charset charset) { return createHash(val.getBytes(charset)); } /** * Generates a digest based on the contents of a String. * * @param val * specifies the input data. The encoding is expected to be * UTF-8. * @return digest as long. */ public static long createHash(String val) { return createHash(val, charset); } /** * Generates a digest based on the contents of an array of bytes. * * @param data * specifies input data. * @return digest as long. */ public static long createHash(byte[] data) { long h = 0; byte[] res; synchronized (digestFunction) { res = digestFunction.digest(data); } for (int i = 0; i < 4; i++) { h <<= 8; h |= ((int) res[i]) & 0xFF; } return h; } /** * Compares the contents of two instances to see if they are equal. * * @param obj * is the object to compare to. * @return True if the contents of the objects are equal. */ @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final BloomFilter<E> other = (BloomFilter<E>) obj; if (this.expectedNumberOfFilterElements != other.expectedNumberOfFilterElements) { return false; } if (this.k != other.k) { return false; } if (this.bitSetSize != other.bitSetSize) { return false; } if (this.bitset != other.bitset && (this.bitset == null || !this.bitset.equals(other.bitset))) { return false; } return true; } /** * Calculates a hash code for this class. * * @return hash code representing the contents of an instance of this class. */ @Override public int hashCode() { int hash = 7; hash = 61 * hash + (this.bitset != null ? this.bitset.hashCode() : 0); hash = 61 * hash + this.expectedNumberOfFilterElements; hash = 61 * hash + this.bitSetSize; hash = 61 * hash + this.k; return hash; } /** * Calculates the expected probability of false positives based on the * number of expected filter elements and the size of the Bloom filter. <br * /><br /> The value returned by this method is the <i>expected</i> rate of * false positives, assuming the number of inserted elements equals the * number of expected elements. If the number of elements in the Bloom * filter is less than the expected value, the true probability of false * positives will be lower. * * @return expected probability of false positives. */ public double expectedFalsePositiveProbability() { return getFalsePositiveProbability(expectedNumberOfFilterElements); } /** * Calculate the probability of a false positive given the specified number * of inserted elements. * * @param numberOfElements * number of inserted elements. * @return probability of a false positive. */ public double getFalsePositiveProbability(double numberOfElements) { // (1 - e^(-k * n / m)) ^ k return Math.pow((1 - Math.exp(-k * (double) numberOfElements / (double) bitSetSize)), k); } /** * Get the current probability of a false positive. The probability is * calculated from the size of the Bloom filter and the current number of * elements added to it. * * @return probability of false positives. */ public double getFalsePositiveProbability() { return getFalsePositiveProbability(numberOfAddedElements); } /** * Returns the value chosen for K.<br /> <br /> K is the optimal number of * hash functions based on the size of the Bloom filter and the expected * number of inserted elements. * * @return optimal k. */ public int getK() { return k; } /** * Sets all bits to false in the Bloom filter. */ public void clear() { bitset.clear(); numberOfAddedElements = 0; } /** * Adds an object to the Bloom filter. The output from the object's * toString() method is used as input to the hash functions. * * @param element * is an element to register in the Bloom filter. */ public void add(E element) { long hash; String valString = element.toString(); for (int x = 0; x < k; x++) { hash = createHash(valString + Integer.toString(x)); hash = hash % (long) bitSetSize; bitset.set(Math.abs((int) hash), true); } numberOfAddedElements++; } /** * Adds all elements from a Collection to the Bloom filter. * * @param c * Collection of elements. */ public void addAll(Collection<? extends E> c) { for (E element : c) add(element); } /** * Returns true if the element could have been inserted into the Bloom * filter. Use getFalsePositiveProbability() to calculate the probability of * this being correct. * * @param element * element to check. * @return true if the element could have been inserted into the Bloom * filter. */ public boolean contains(E element) { long hash; String valString = element.toString(); for (int x = 0; x < k; x++) { hash = createHash(valString + Integer.toString(x)); hash = hash % (long) bitSetSize; if (!bitset.get(Math.abs((int) hash))) return false; } return true; } /** * Returns true if all the elements of a Collection could have been inserted * into the Bloom filter. Use getFalsePositiveProbability() to calculate the * probability of this being correct. * * @param c * elements to check. * @return true if all the elements in c could have been inserted into the * Bloom filter. */ public boolean containsAll(Collection<? extends E> c) { for (E element : c) if (!contains(element)) return false; return true; } /** * Read a single bit from the Bloom filter. * * @param bit * the bit to read. * @return true if the bit is set, false if it is not. */ public boolean getBit(int bit) { return bitset.get(bit); } /** * Set a single bit in the Bloom filter. * * @param bit * is the bit to set. * @param value * If true, the bit is set. If false, the bit is cleared. */ public void setBit(int bit, boolean value) { bitset.set(bit, value); } /** * Return the bit set used to store the Bloom filter. * * @return bit set representing the Bloom filter. */ public BitSet getBitSet() { return bitset; } /** * Returns the number of bits in the Bloom filter. Use count() to retrieve * the number of inserted elements. * * @return the size of the bitset used by the Bloom filter. */ public int size() { return this.bitSetSize; } /** * Returns the number of elements added to the Bloom filter after it was * constructed or after clear() was called. * * @return number of elements added to the Bloom filter. */ public int count() { return this.numberOfAddedElements; } /** * Returns the expected number of elements to be inserted into the filter. * This value is the same value as the one passed to the constructor. * * @return expected number of elements. */ public int getExpectedNumberOfElements() { return expectedNumberOfFilterElements; } /** * Get expected number of bits per element when the Bloom filter is full. * This value is set by the constructor when the Bloom filter is created. * See also getBitsPerElement(). * * @return expected number of bits per element. */ public double getExpectedBitsPerElement() { return this.bitsPerElement; } /** * Get actual number of bits per element based on the number of elements * that have currently been inserted and the length of the Bloom filter. See * also getExpectedBitsPerElement(). * * @return number of bits per element. */ public double getBitsPerElement() { return this.bitSetSize / (double) numberOfAddedElements; } }
    package com.lietu.show;
    
    import java.io.IOException;
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.sql.SQLException;
    import java.util.Properties;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    //提取商品信息
    public class ExtractProduct {
    
        static BloomFilter<String> urlSeen = new BloomFilter<String>(200000, 20000);
    
        /**
         * @param args
         * @throws IOException
         */
        public static void main(String[] args) throws IOException {
            String url = "http://www.xiu.com/about/classifymap.shtml";
            Document doc = Jsoup.connect(url).get();
            Elements links = doc.select("a[href]"); // 带有href属性的a标签
            for (Element link : links) { // 遍历每个链接
                String linkHref = link.attr("href"); // 得到href属性中的值,也就是url地址
                if (linkHref.startsWith("http://list.xiu.com/")
                        || linkHref.startsWith("http://brand.xiu.com/")) {
                    linkHref = getListURL(linkHref);
                    if (urlSeen.contains(linkHref))
                        continue;
                    // System.out.println(linkHref); //输出url地址和锚点上的文字说明
                    extractList(linkHref);
                }
            }
        }
        
        public static String getListURL(String oldURL){
            ParseURL splitURL = new ParseURL(oldURL);
            if(splitURL.searchparms==null){
                return splitURL.baseURL;
            }
            //System.out.println(splitURL.baseURL);
            String curPage = splitURL.searchparms.get("currentPage");
            // http://list.xiu.com/20069.html?currentPage=2
            if (curPage != null)
                //System.out.println(splitURL.baseURL + "?currentPage=" + curPage);
                return splitURL.baseURL + "?currentPage=" + curPage;
            else{
                //System.out.println(splitURL.baseURL);
                return splitURL.baseURL;
            }
        }
    
        //处理目录页
        // http://list.xiu.com/19757.html
        // http://brand.xiu.com/3117.html
        public static void extractList(String url) {
            System.out.println("List url:" + url);
            urlSeen.add(url);
            Document doc = null;
            try {
                doc = Jsoup.connect(url).get();
            } catch (Exception e) {
                try {
                    doc = Jsoup.connect(url).get();
                } catch (IOException e1) {
                    // TODO Auto-generated catch block
                    e1.printStackTrace();
                    return;
                }
            }
    
            Elements links = doc.select("a[href]"); // 带有href属性的a标签
            for (Element link : links) { // 遍历每个链接
                String linkHref = link.attr("href"); // 得到href属性中的值,也就是url地址
                if (linkHref.startsWith("http://list.xiu.com/")
                        || linkHref.startsWith("http://brand.xiu.com/")) {
                    linkHref = getListURL(linkHref);
                    if (urlSeen.contains(linkHref))
                        continue;
                    extractList(linkHref); // 输出url地址和锚点上的文字说明
                } else if (linkHref.startsWith("http://item.xiu.com/product/")) { // "http://item.xiu.com/product/0359097.shtml"
                    if (urlSeen.contains(linkHref))
                        continue;
                    getProduct(linkHref);
                }
            }
        }
    
        //处理详细页
        public static void getProduct(String url) {
            System.out.println("Product url:" + url);
            urlSeen.add(url);
            // String url = "http://item.xiu.com/product/0359097.shtml";
            Document doc = null;
            try {
                doc = Jsoup.connect(url).get();
            } catch (Exception e) {
                try{
                    doc = Jsoup.connect(url).get();
                }
                catch (Exception e2) {
                    try {
                        doc = Jsoup.connect(url).get();
                    } catch (IOException e1) {
                        // TODO Auto-generated catch block
                        e1.printStackTrace();
                        return;
                    }
                }
            }
            Elements links = doc.select("div.p_title"); // 带有href属性的a标签
            String name = links.get(0).childNode(1).childNode(0).toString();
            // System.out.println(name);
    
            Element link = doc.select("div.conlist").get(2);
            String desc = link.text();
            // System.out.println(link.text());
    
            Element img = doc.select("#imgPic").first();
            String imgSrc = img.attr("src");
            // System.out.println(img.attr("src"));
            
            Element thumb = doc.select("img[onload]").first();
            String thumbSrc = thumb.attr("src");
            //System.out.println(thumbSrc);
            
            insertDatabase(url,name,desc,imgSrc,thumbSrc);
        }
    
        public static void insertDatabase(String url, String name, String desc,
                String img,String thumbSrc) {
            Connection conn = null;
            PreparedStatement stmt2 = null;
            try {
                Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
                Properties p = new Properties(); 
                p.put("charSet", "gb2312");
                conn = DriverManager
                        .getConnection("jdbc:odbc:driver={Microsoft Access Driver (*.mdb)};DBQ=good.mdb",p);
    
                stmt2 = conn
                        .prepareStatement("insert into good(url,productName,productDesc,img,thumb)values(?,?,?,?,?)");
                stmt2.setString(1, url);
                stmt2.setString(2, name);
                stmt2.setString(3, desc);
                stmt2.setString(4, img);
                stmt2.setString(5, thumbSrc);
                stmt2.executeUpdate();
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                try {
                    if (stmt2 != null)
                        stmt2.close();
                    if (conn != null)
                        conn.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    
    }

      给定误算率p,插入元素个数n,那么最优位数组大小为:

     

    Hash函数个数选取最优数目为:

      

    
    python 实现:
    
    import math
    
    N = 30000000000.0
    P = 0.001
    
    def get_len_arr(n, p):
        return -(n * math.log(p) / (math.log(2) ** 2))
    
    
    def get_k(p):
        return - (math.log(p) / math.log(2))
    
    m = get_len_arr(N, P)
    
    print m, 'bit'
    
    g = m / (8 * 1024 * 1024 * 1024)
    
    print g, 'GB'
    
    k = get_k(P)
    
    print k, 'ge hash hanshu'
    =========================
    4.31327626982e+11 bit
    50.2131445079 GB 300亿,50G内存。
    9.96578428466 ge hash hanshu
  • 相关阅读:
    Jenkins知识地图
    PageObject&PageFactory
    robot framework-databaselibaray库使用(python)
    robot framework环境搭建
    使用instantclient_11_2 和PL/SQL Developer工具包连接oracle 11g远程数据库(转)
    (转)pip和easy_install使用方式
    图解:如何在LINUX中安装VM-Tools
    ANT简明教程[转载]
    Linux常用命令
    oracle数据分组
  • 原文地址:https://www.cnblogs.com/i80386/p/2818157.html
Copyright © 2011-2022 走看看