zoukankan      html  css  js  c++  java
  • Heritrix 3.1.0 源码解析(三十二)

    本文要分析的是FetchDNS处理器。该处理器的功能是对CrawlURI curi对象所指向的主机名做DNS解析(得到对应的IP地址),解析时使用的是dnsjava-2.0.3.jar组件(我们也可以参考本文代码,了解如何使用dnsjava-2.0.3.jar组件的API解析DNS)

    FetchDNS处理器的重要成员变量

    // Defaults used when innerProcess() builds the dnsjava Lookup.
        // DNS class handed to the Lookup constructor; initialised to DClass.IN.
        private short ClassType = DClass.IN;
        // DNS record type handed to the Lookup constructor; initialised to Type.A.
        private short TypeType = Type.A;
        // Starts out null; not referenced by any of the methods shown in this
        // section.
        protected InetAddress serverInetAddr = null;
    
     /**
         * Cache from which innerProcess() obtains the CrawlHost entry for a
         * host name (via getHostFor).
         */
        protected ServerCache serverCache;
        /** Returns the ServerCache currently assigned to this processor. */
        public ServerCache getServerCache() {
            return this.serverCache;
        }
        /**
         * Assigns the ServerCache used by this processor. Annotated with
         * Autowired so the container can inject it.
         */
        @Autowired
        public void setServerCache(ServerCache serverCache) {
            this.serverCache = serverCache;
        }
        
        /**
         * Whether or not to perform an on-the-fly digest hash of retrieved
         * content-bodies.
         */
        // Instance initializer: the setting defaults to true for every new
        // instance.
        {
            setDigestContent(true);
        }
        /**
         * Reads the "digestContent" entry from kp and casts it to Boolean.
         * (kp is declared outside this section.)
         */
        public boolean getDigestContent() {
            return (Boolean) kp.get("digestContent");
        }
        /** Stores the flag in kp under the key "digestContent". */
        public void setDigestContent(boolean digest) {
            kp.put("digestContent",digest);
        }
    
        /**
         * Which algorithm (for example MD5 or SHA-1) to use to perform an 
         * on-the-fly digest hash of retrieved content-bodies.
         * Defaults to "sha1"; recordDNS() passes this value to the recorder's
         * setDigest and to CrawlURI.setContentDigest.
         */
        String digestAlgorithm = "sha1"; 
        /** Returns the configured digest algorithm name. */
        public String getDigestAlgorithm() {
            return digestAlgorithm;
        }
        /** Replaces the configured digest algorithm name. */
        public void setDigestAlgorithm(String digestAlgorithm) {
            this.digestAlgorithm = digestAlgorithm;
        }

    处理器void innerProcess(CrawlURI curi)方法

    protected void innerProcess(CrawlURI curi) {
            // Resolve the host name referenced by the URI. A parse failure is
            // logged and leaves the name null, which is handled just below.
            String hostName = null;
            try {
                hostName = curi.getUURI().getReferencedHost();
            } catch (URIException e) {
                logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
            }
            if (hostName == null) {
                curi.setFetchStatus(S_UNFETCHABLE_URI);
                return;
            }

            final CrawlHost crawlHost = getServerCache().getHostFor(hostName);
            // A dotted-quad name is converted to an InetAddress directly by
            // isQuadAddress; nothing is left to do in that case.
            if (isQuadAddress(curi, hostName, crawlHost)) {
                return;
            }

            // From here on an actual DNS lookup is performed.
            curi.setFetchBeginTime(System.currentTimeMillis());

            // The lookup is done against the name with a trailing dot appended
            // when it does not already end with one.
            // TODO: Bug #935119 concerns potential hang here
            String absoluteName = hostName;
            if (!absoluteName.endsWith(".")) {
                absoluteName = absoluteName + ".";
            }

            // Retrieved DNS records; null when the lookup yields nothing or the
            // name cannot be parsed.
            Record[] answers;
            try {
                answers = new Lookup(absoluteName, TypeType, ClassType).run();
            } catch (TextParseException e) {
                answers = null;
            }
            curi.setContentType("text/dns");

            final boolean resolved = answers != null;
            if (logger.isLoggable(Level.FINE)) {
                logger.fine((resolved
                        ? "Found recordset for "
                        : "Failed find of recordset for ") + absoluteName);
            }

            if (resolved) {
                // Sets the IP on the CrawlHost and records the answer through
                // the CrawlURI's recorder.
                storeDNSRecord(curi, hostName, crawlHost, answers);
            } else if (getAcceptNonDnsResolves() || "localhost".equals(hostName)) {
                // Fallback lookup that bypasses dnsjava.
                InetAddress nativeAddress;
                try {
                    nativeAddress = InetAddress.getByName(hostName);
                } catch (UnknownHostException e1) {
                    nativeAddress = null;
                }
                if (nativeAddress == null) {
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Failed find of address for " + hostName +
                            " using native dns.");
                    }
                    setUnresolvable(curi, crawlHost);
                } else {
                    crawlHost.setIP(nativeAddress, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                    if (logger.isLoggable(Level.FINE)) {
                        logger.fine("Found address for " + hostName +
                            " using native dns.");
                    }
                }
            } else {
                setUnresolvable(curi, crawlHost);
            }
            curi.setFetchCompletedTime(System.currentTimeMillis());
        }

    相关调用方法如下(dnsjava-2.0.3.jar组件的API) 

    /**
         * Stores a successful DNS answer: sets the IP and TTL of the first A
         * record on the CrawlHost, then records the answer through the
         * CrawlURI's recorder and marks the fetch as S_DNS_SUCCESS. If
         * recording fails with an IOException the failure is logged and the
         * host is marked unresolvable instead.
         *
         * @param curi the DNS CrawlURI being processed
         * @param dnsName host name that was looked up (used in the error message)
         * @param targetHost host entry whose IP is updated
         * @param rrecordSet records returned by the lookup
         * @throws NullPointerException if rrecordSet contains no A record
         */
        protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
                final CrawlHost targetHost, final Record[] rrecordSet) {
            // There may be several A records (e.g. www.washington.edu); TTL and
            // IP are taken from the first one only.
            final ARecord firstA = getFirstARecord(rrecordSet);
            if (firstA == null) {
                throw new NullPointerException("Got null arecord for " + dnsName);
            }

            // Update the host entry before attempting to record the answer.
            targetHost.setIP(firstA.getAddress(), firstA.getTTL());

            try {
                // Feed the answer through the CrawlURI's recorder.
                recordDNS(curi, rrecordSet);
                curi.setFetchStatus(S_DNS_SUCCESS);
                curi.setDNSServerIPLabel(
                        ResolverConfig.getCurrentConfig().server());
            } catch (IOException e) {
                logger.log(Level.SEVERE,
                        "Failed store of DNS Record for " + curi.toString(), e);
                setUnresolvable(curi, targetHost);
            }
        }
        /**
         * Handles host names that are already dotted-quad IPv4 addresses: the
         * four groups matched by InetAddressUtil.IPV4_QUADS are converted into
         * an InetAddress, which is stored on the host with
         * CrawlHost.IP_NEVER_EXPIRES, and the fetch is marked S_DNS_SUCCESS.
         *
         * @param curi the DNS CrawlURI being processed
         * @param dnsName host name to test
         * @param targetHost host entry that receives the address
         * @return true if dnsName matched the quad pattern (no lookup needed),
         *         false otherwise
         */
        protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
                final CrawlHost targetHost) {
            Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
            // Not a numeric IP: the caller has to do a real lookup.
            if (matcher == null || !matcher.matches()) {
                return false;
            }

            // Ideally this branch would never be reached: no CrawlURI
            // would be created for numerical IPs
            if (logger.isLoggable(Level.WARNING)) {
                logger.warning("Unnecessary DNS CrawlURI created: " + curi);
            }
            try {
                // Integer.parseInt replaces the deprecated Integer(String)
                // constructor; it parses the same input without boxing.
                // NOTE(review): a group above 255 is narrowed by the byte cast
                // rather than rejected -- confirm IPV4_QUADS rules that out.
                final byte[] octets = new byte[4];
                for (int i = 0; i < octets.length; i++) {
                    octets[i] = (byte) Integer.parseInt(matcher.group(i + 1));
                }
                targetHost.setIP(InetAddress.getByAddress(dnsName, octets),
                        CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
                curi.setFetchStatus(S_DNS_SUCCESS);
            } catch (UnknownHostException e) {
                logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
                setUnresolvable(curi, targetHost);
            }
            return true;
        }
        /**
         * Pushes the textual DNS answer through the CrawlURI's Recorder so it
         * is captured like any other fetched content, then sets the content
         * size and, when digesting is enabled, the content digest on the
         * CrawlURI.
         *
         * @param curi the DNS CrawlURI being processed
         * @param rrecordSet records returned by the lookup
         * @throws IOException if reading or closing the wrapped stream, or
         *         building the record bytes, fails
         */
        protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
                throws IOException {
            // Serialise the answer (date line plus one line per record).
            final byte[] dnsRecord = getDNSRecord(curi.getFetchBeginTime(),
                    rrecordSet);

            Recorder rec = curi.getRecorder();
            // Shall we get a digest on the content downloaded?
            boolean digestContent = getDigestContent();
            String algorithm = null;
            if (digestContent) {
                algorithm = getDigestAlgorithm();
                rec.getRecordedInput().setDigest(algorithm);
            } else {
                rec.getRecordedInput().setDigest((MessageDigest)null);
            }
            // Wrap the bytes with the recorder fetched above, so digest setup
            // and stream wrapping use the same Recorder reference.
            InputStream is = rec.inputWrap(new ByteArrayInputStream(dnsRecord));

            if (digestContent) {
                rec.getRecordedInput().startDigest();
            }

            // Reading from the wrapped stream, behind the scenes, will write
            // files into scratch space
            try {
                while (is.read(this.reusableBuffer) != -1) {
                    continue;
                }
            } finally {
                // Nested finally: the recorders are closed even when closing
                // the wrapped stream itself throws.
                try {
                    is.close();
                } finally {
                    rec.closeRecorders();
                }
            }
            curi.setContentSize(dnsRecord.length);

            if (digestContent) {
                curi.setContentDigest(algorithm,
                    rec.getRecordedInput().getDigestValue());
            }
        }
        /**
         * Serialises a DNS answer to bytes: a 14-digit date line for the fetch
         * start time, followed by one line per record (each record's
         * toString() output terminated by a newline).
         *
         * @param fetchStart fetch start time passed to ArchiveUtils.get14DigitDate
         * @param rrecordSet records to serialise; may be null, in which case
         *        only the date line is produced
         * @return the serialised record bytes
         * @throws IOException if writing to the in-memory buffer fails
         */
        protected byte [] getDNSRecord(final long fetchStart,
                final Record[] rrecordSet)
        throws IOException {
            // NOTE(review): getBytes() uses the platform default charset, as
            // the original did; the output is presumably ASCII -- confirm.
            final byte[] newline = "\n".getBytes();
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            // Start the record with a 14-digit date per RFC 2540
            baos.write(ArchiveUtils.get14DigitDate(fetchStart).getBytes());
            // Don't forget the newline
            baos.write(newline);
            if (rrecordSet != null) {
                for (Record record : rrecordSet) {
                    baos.write(record.toString().getBytes());
                    // Add the newline between records back in
                    baos.write(newline);
                }
            }
            return baos.toByteArray();
        }
        
        /**
         * Marks a failed resolution: resets the host's IP to null with a TTL
         * argument of 0 and sets the CrawlURI's fetch status to
         * S_DOMAIN_UNRESOLVABLE.
         *
         * @param curi the DNS CrawlURI being processed
         * @param host host entry whose IP is cleared
         */
        protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
            host.setIP(null, 0);
            curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); 
        }
        /**
         * Finds the first record of type Type.A in the given array.
         *
         * @param rrecordSet records to scan; may be null or empty
         * @return the first A record cast to ARecord, or null when the array
         *         is null, empty, or holds no A record
         */
        protected ARecord getFirstARecord(Record[] rrecordSet) {
            final boolean nothingToScan =
                    rrecordSet == null || rrecordSet.length == 0;
            if (nothingToScan) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("rrecordSet is null or zero length: " +
                        rrecordSet);
                }
                return null;
            }

            // Position of the current record, kept only for the trace message.
            int position = 0;
            for (Record candidate : rrecordSet) {
                if (candidate.getType() == Type.A) {
                    return (ARecord) candidate;
                }
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("Record " + position +
                        " is not A type but " + candidate.getType());
                }
                position++;
            }
            return null;
        }

    FetchDNS处理器和后面的FetchHTTP处理器涉及到消息摘要算法MessageDigest digest 对象,我这里转自网上的一篇文章供参考 

    转自 http://huangyunbin.iteye.com/blog/1123442

    MessageDigest的功能及用法

    MessageDigest 类为应用程序提供信息摘要算法的功能,如 MD5 或 SHA 算法。信息摘要是安全的单向哈希函数,它接收任意大小的数据,并输出固定长度的哈希值。 

    MessageDigest 对象在创建后即处于已初始化状态。通过调用 update() 方法把待处理的数据交给该对象。任何时候都可以调用 reset() 方法重置摘要。一旦所有待处理的数据都已通过 update() 提交完毕,就应该调用某一个 digest() 方法完成哈希计算。 

    对于给定数量的更新数据,digest 方法只能被调用一次。在调用 digest 之后,MessageDigest 对象被重新设置成其初始状态。 

    1、public static MessageDigest getInstance(String algorithm) 
                                     throws NoSuchAlgorithmException 

       返回实现指定摘要算法的 MessageDigest 对象。 

       algorithm - 所请求算法的名称 

    2、public static MessageDigest getInstance(String algorithm, 
                                            String provider) 
                                     throws NoSuchAlgorithmException, 
                                            NoSuchProviderException 

      返回实现指定摘要算法的 MessageDigest 对象。 

      algorithm - 所请求算法的名称 

      provider - 提供者的名称。 

    3、public void update(byte[] input) 

      使用指定的 byte 数组更新摘要。 

    4、public byte[] digest() 

      通过执行诸如填充之类的最终操作完成哈希计算。在调用此方法之后,摘要被重置。 

    5、public static boolean isEqual(byte[] digesta, 
                                  byte[] digestb) 

        比较两个摘要的相等性。做简单的字节比较。 


    注意:可以通过 java.security.Security.getProviders() 方法获取已注册的提供者(Provider)列表,比较常用的提供者是“SUN”。

    SUN提供的常用的算法名称有:
        MD2
        MD5
        SHA-1
        SHA-256
        SHA-384
        SHA-512

    Code举例: 

    import java.security.*; 
    /**
     * Small demonstration of java.security.MessageDigest: computes the SHA-1
     * digest of a text twice, compares the two results and prints the digest
     * as colon-separated upper-case hex.
     */
    public class myDigest {
      /**
       * Charset used to turn text into bytes. It is fixed explicitly so the
       * digest of non-ASCII text does not depend on the platform default.
       */
      private static final java.nio.charset.Charset UTF8 =
          java.nio.charset.Charset.forName("UTF-8");

      /** Upper-case hexadecimal digits indexed by nibble value. */
      private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

      public static void main(String[] args) {
        myDigest my = new myDigest();
        my.testDigest();
      }

      /**
       * Digests a sample message, prints the digest, then recomputes it the
       * way a receiver would and prints whether the two digests match.
       */
      public void testDigest() {
        try {
          String myinfo = "我的测试信息";
          // "MD5" may be passed to getInstance instead of "SHA-1".
          MessageDigest alga = MessageDigest.getInstance("SHA-1");
          alga.update(myinfo.getBytes(UTF8));
          byte[] digesta = alga.digest();
          System.out.println("本信息摘要是:" + byte2hex(digesta));
          // After the message (myinfo) and its digest (digesta) have been
          // handed to another party, that party can recompute the digest to
          // check that the message was not altered or corrupted in transit.
          MessageDigest algb = MessageDigest.getInstance("SHA-1");
          algb.update(myinfo.getBytes(UTF8));
          // isEqual is static, so it is called on the class, not an instance.
          if (MessageDigest.isEqual(digesta, algb.digest())) {
            System.out.println("信息检查正常");
          } else {
            System.out.println("摘要不相同");
          }
        } catch (NoSuchAlgorithmException ex) {
          System.out.println("非法摘要算法");
        }
      }

      /**
       * Converts bytes to a hex string: two upper-case hex digits per byte,
       * bytes separated by ':'. An empty array yields an empty string.
       *
       * @param b bytes to convert; must not be null
       * @return the hex representation, for example "0A:FF"
       */
      public String byte2hex(byte[] b) {
        // A StringBuilder avoids the quadratic cost of repeated String
        // concatenation; the digit table avoids locale-sensitive upper-casing.
        StringBuilder hex = new StringBuilder(b.length * 3);
        for (int n = 0; n < b.length; n++) {
          if (n > 0) {
            hex.append(':');
          }
          int value = b[n] & 0xFF;
          hex.append(HEX_DIGITS[value >>> 4]).append(HEX_DIGITS[value & 0x0F]);
        }
        return hex.toString();
      }
    }

    关于Java加密的更多信息:http://www.ibm.com/developerworks/cn/java/l-security/

    --------------------------------------------------------------------------

    本系列Heritrix 3.1.0 源码解析系本人原创

    转载请注明出处 博客园 刺猬的温驯

    本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/30/3052411.html

  • 相关阅读:
    BZOJ 2212/BZOJ 3702
    BZOJ 4761 Cow Navigation
    BZOJ 3209 花神的数论题
    BZOJ 4760 Hoof, Paper, Scissors
    BZOJ 3620 似乎在梦中见过的样子
    BZOJ 3940 Censoring
    BZOJ 3942 Censoring
    BZOJ 3571 画框
    BZOJ 1937 最小生成树
    BZOJ 1058 报表统计
  • 原文地址:https://www.cnblogs.com/chenying99/p/3052411.html
Copyright © 2011-2022 走看看