zoukankan      html  css  js  c++  java
  • Heritrix 3.1.0 源码解析(三十二)

    本文分析FetchDNS处理器。该处理器负责对CrawlURI curi对象中的主机名进行DNS解析,底层使用dnsjava-2.0.3.jar组件完成解析(读者也可以参考本文代码,直接使用dnsjava-2.0.3.jar的API进行DNS解析)。

    FetchDNS处理器的重要成员变量

    // Defaults.
        // DNS class handed to every Lookup that innerProcess builds.
        private short ClassType = DClass.IN;
        // DNS record type handed to every Lookup that innerProcess builds;
        // getFirstARecord likewise only accepts Type.A records.
        private short TypeType = Type.A;
        // NOTE(review): not referenced by any of the methods shown in this
        // excerpt -- confirm whether it is still used elsewhere in the class.
        protected InetAddress serverInetAddr = null;
    
     /**
         * Cache of crawl hosts; innerProcess asks it for the CrawlHost that
         * belongs to the name being resolved.
         */
        protected ServerCache serverCache;

        /** @return the cache currently used to look up crawl hosts */
        public ServerCache getServerCache() {
            return serverCache;
        }

        /**
         * Installs the host cache. Annotated for autowired injection.
         *
         * @param serverCache cache to use from now on
         */
        @Autowired
        public void setServerCache(ServerCache serverCache) {
            this.serverCache = serverCache;
        }
        
        /**
         * Flag deciding whether recordDNS computes a digest of the recorded
         * content while it is being read. The value lives in {@code kp} under
         * the key "digestContent"; the initializer below switches it on.
         */
        {
            setDigestContent(true);
        }

        /** @return {@code true} when content digesting is enabled */
        public boolean getDigestContent() {
            Boolean enabled = (Boolean) kp.get("digestContent");
            return enabled.booleanValue();
        }

        /** @param digest {@code true} to enable content digesting */
        public void setDigestContent(boolean digest) {
            kp.put("digestContent", Boolean.valueOf(digest));
        }
    
        /**
         * Name of the digest algorithm (for example MD5 or SHA-1) that
         * recordDNS applies to the recorded content when digesting is enabled.
         * Starts out as "sha1".
         */
        String digestAlgorithm = "sha1";

        /** @return the configured digest algorithm name */
        public String getDigestAlgorithm() {
            return this.digestAlgorithm;
        }

        /** @param digestAlgorithm algorithm name to use for content digests */
        public void setDigestAlgorithm(String digestAlgorithm) {
            this.digestAlgorithm = digestAlgorithm;
        }

    处理器void innerProcess(CrawlURI curi)方法

    /**
         * Resolves the host referenced by {@code curi} and stores the outcome
         * on both the URI and its CrawlHost.
         * <p>
         * Order of work: mark the URI unfetchable when no host name can be read
         * from it; let isQuadAddress deal with dotted-quad names without any
         * lookup; otherwise run a dnsjava Lookup. When that yields no records,
         * fall back to {@code InetAddress.getByName} if
         * {@code getAcceptNonDnsResolves()} is true or the name is "localhost",
         * and mark the host unresolvable if nothing works.
         *
         * @param curi the URI whose host name should be resolved
         */
        protected void innerProcess(CrawlURI curi) {
            String dnsName = null;
            try {
                dnsName = curi.getUURI().getReferencedHost();
            } catch (URIException e) {
                logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
            }
            if (dnsName == null) {
                curi.setFetchStatus(S_UNFETCHABLE_URI);
                return;
            }

            CrawlHost targetHost = getServerCache().getHostFor(dnsName);
            // A literal IPv4 address is converted straight to an InetAddress;
            // nothing further to do in that case.
            if (isQuadAddress(curi, dnsName, targetHost)) {
                return;
            }

            // A real lookup follows, so the fetch clock starts here.
            curi.setFetchBeginTime(System.currentTimeMillis());

            // Make sure the name handed to dnsjava ends with a dot.
            // TODO: Bug #935119 concerns potential hang here
            String lookupName = dnsName;
            if (!lookupName.endsWith(".")) {
                lookupName = lookupName + ".";
            }
            Record[] answers;
            try {
                answers = new Lookup(lookupName, TypeType, ClassType).run();
            } catch (TextParseException e) {
                // Treated exactly like "no records found".
                answers = null;
            }
            curi.setContentType("text/dns");

            if (answers == null) {
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("Failed find of recordset for " + lookupName);
                }
                boolean tryNative = getAcceptNonDnsResolves()
                        || "localhost".equals(dnsName);
                if (!tryNative) {
                    setUnresolvable(curi, targetHost);
                } else {
                    // Do lookup that bypasses javadns.
                    InetAddress address;
                    try {
                        address = InetAddress.getByName(dnsName);
                    } catch (UnknownHostException e1) {
                        address = null;
                    }
                    if (address == null) {
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("Failed find of address for " + dnsName +
                                " using native dns.");
                        }
                        setUnresolvable(curi, targetHost);
                    } else {
                        targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
                        curi.setFetchStatus(S_GETBYNAME_SUCCESS);
                        if (logger.isLoggable(Level.FINE)) {
                            logger.fine("Found address for " + dnsName +
                                " using native dns.");
                        }
                    }
                }
            } else {
                if (logger.isLoggable(Level.FINE)) {
                    logger.fine("Found recordset for " + lookupName);
                }
                // Sets the IP on targetHost and records the DNS response
                // through the URI's recorder.
                storeDNSRecord(curi, dnsName, targetHost, answers);
            }
            curi.setFetchCompletedTime(System.currentTimeMillis());
        }

    相关调用方法如下(dnsjava-2.0.3.jar组件的API) 

    /**
         * Copies the result of a successful lookup onto the host and the URI:
         * the first A record supplies the host's IP and TTL, and the whole
         * record set is written through the URI's recorder. If recording fails
         * with an IOException the host is marked unresolvable again.
         *
         * @param curi       URI being processed; receives the fetch status
         * @param dnsName    host name that was looked up (used in the error text)
         * @param targetHost host whose IP is updated
         * @param rrecordSet records returned by the lookup
         * @throws NullPointerException if {@code rrecordSet} holds no A record
         */
        protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
                final CrawlHost targetHost, final Record[] rrecordSet) {
            // There may be several A records (e.g. www.washington.edu); only
            // the first one supplies the TTL and IP.
            final ARecord firstA = getFirstARecord(rrecordSet);
            if (firstA == null) {
                final String complaint = "Got null arecord for " + dnsName;
                throw new NullPointerException(complaint);
            }
            targetHost.setIP(firstA.getAddress(), firstA.getTTL());

            try {
                // Push the records through the URI's recorder, then flag success.
                recordDNS(curi, rrecordSet);
                curi.setFetchStatus(S_DNS_SUCCESS);
                curi.setDNSServerIPLabel(ResolverConfig.getCurrentConfig().server());
            } catch (IOException e) {
                final String complaint =
                    "Failed store of DNS Record for " + curi.toString();
                logger.log(Level.SEVERE, complaint, e);
                setUnresolvable(curi, targetHost);
            }
        }
        /**
         * Short-circuits resolution when the "host name" is already a
         * dotted-quad IPv4 literal: the four captured groups are turned into an
         * InetAddress that is stored on the host with a never-expiring TTL.
         *
         * @param curi       URI being processed; receives the fetch status
         * @param dnsName    candidate host name
         * @param targetHost host whose IP is set when {@code dnsName} is a literal
         * @return {@code true} if {@code dnsName} matched
         *         {@code InetAddressUtil.IPV4_QUADS} and was handled here (even
         *         when building the address failed and the host was marked
         *         unresolvable); {@code false} if a real lookup is still needed
         */
        protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
                final CrawlHost targetHost) {
            // A java.util.regex.Matcher is never null, so matches() is the only
            // test required before skipping the lookup.
            final Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
            if (!matcher.matches()) {
                return false;
            }

            // Ideally this branch would never be reached: no CrawlURI
            // would be created for numerical IPs
            if (logger.isLoggable(Level.WARNING)) {
                logger.warning("Unnecessary DNS CrawlURI created: " + curi);
            }
            try {
                final byte[] octets = new byte[4];
                for (int i = 0; i < octets.length; i++) {
                    // NOTE(review): the narrowing cast silently wraps values
                    // above 255 -- confirm IPV4_QUADS cannot match such groups.
                    octets[i] = (byte) Integer.parseInt(matcher.group(i + 1));
                }
                targetHost.setIP(InetAddress.getByAddress(dnsName, octets),
                        CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
                curi.setFetchStatus(S_DNS_SUCCESS);
            } catch (UnknownHostException e) {
                logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
                setUnresolvable(curi, targetHost);
            }
            return true;
        }
        /**
         * Writes the DNS records through the URI's recorder and stores the
         * resulting content size (and, when digesting is enabled, the content
         * digest) on the URI.
         *
         * @param curi       URI whose recorder is used and whose content size
         *                   and digest are set
         * @param rrecordSet records to record
         * @throws IOException if building the record bytes, reading the wrapped
         *                     stream, or closing it fails
         */
        protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
                throws IOException {
            // Serialized form of the records, prefixed with the fetch date.
            final byte[] dnsRecord = getDNSRecord(curi.getFetchBeginTime(),
                    rrecordSet);

            Recorder rec = curi.getRecorder();
            // Shall we get a digest on the content downloaded?
            boolean digestContent = getDigestContent();
            String algorithm = null;
            if (digestContent) {
                algorithm = getDigestAlgorithm();
                rec.getRecordedInput().setDigest(algorithm);
            } else {
                rec.getRecordedInput().setDigest((MessageDigest)null);
            }
            // Wrap the bytes so that reading them goes through the recorder.
            InputStream is = curi.getRecorder().inputWrap(
                    new ByteArrayInputStream(dnsRecord));

            if (digestContent) {
                rec.getRecordedInput().startDigest();
            }

            // Reading from the wrapped stream, behind the scenes, will write
            // files into scratch space
            try {
                while (is.read(this.reusableBuffer) != -1) {
                    continue;
                }
            } finally {
                // Nested so the recorders are closed even if closing the
                // stream itself throws.
                try {
                    is.close();
                } finally {
                    rec.closeRecorders();
                }
            }
            curi.setContentSize(dnsRecord.length);

            if (digestContent) {
                curi.setContentDigest(algorithm,
                    rec.getRecordedInput().getDigestValue());
            }
        }
        /**
         * Serializes a lookup result to bytes: the fetch date on the first
         * line, followed by one line per record (each record's
         * {@code toString()} form). Every line, including the last, ends with
         * a newline.
         *
         * @param fetchStart time the lookup began, formatted through
         *                   {@code ArchiveUtils.get14DigitDate}
         * @param rrecordSet records to serialize; may be {@code null}, in which
         *                   case only the date line is produced
         * @return the serialized bytes
         * @throws IOException if writing to the in-memory buffer fails
         */
        protected byte [] getDNSRecord(final long fetchStart,
                final Record[] rrecordSet)
        throws IOException {
            // Pin the charset so the produced bytes do not depend on the JVM's
            // platform default encoding.
            final java.nio.charset.Charset charset =
                java.nio.charset.StandardCharsets.UTF_8;
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            // Start the record with a 14-digit date per RFC 2540
            baos.write(ArchiveUtils.get14DigitDate(fetchStart).getBytes(charset));
            // Don't forget the newline
            baos.write('\n');
            if (rrecordSet != null) {
                for (Record record : rrecordSet) {
                    baos.write(record.toString().getBytes(charset));
                    // Add the newline between records back in
                    baos.write('\n');
                }
            }
            return baos.toByteArray();
        }
        
        /**
         * Records a failed resolution: sets the host's IP to {@code null} with
         * a TTL of 0 and flags the URI with {@code S_DOMAIN_UNRESOLVABLE}.
         *
         * @param curi URI whose fetch status is set
         * @param host host whose IP is cleared
         */
        protected void setUnresolvable(final CrawlURI curi, final CrawlHost host) {
            host.setIP(null, 0);
            curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
        }
        /**
         * Finds the first element of {@code rrecordSet} whose type is
         * {@code Type.A}. Skipped records are reported at FINEST level.
         *
         * @param rrecordSet records to scan; may be {@code null} or empty
         * @return the first A record, or {@code null} when there is none
         */
        protected ARecord getFirstARecord(Record[] rrecordSet) {
            final boolean nothingToScan =
                rrecordSet == null || rrecordSet.length == 0;
            if (nothingToScan) {
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("rrecordSet is null or zero length: " +
                        rrecordSet);
                }
                return null;
            }
            for (int index = 0; index < rrecordSet.length; index++) {
                final Record candidate = rrecordSet[index];
                if (candidate.getType() == Type.A) {
                    // First match wins; later A records are ignored.
                    return (ARecord) candidate;
                }
                if (logger.isLoggable(Level.FINEST)) {
                    logger.finest("Record " + Integer.toString(index) +
                        " is not A type but " + candidate.getType());
                }
            }
            return null;
        }

    FetchDNS处理器和后面的FetchHTTP处理器涉及到消息摘要算法MessageDigest digest 对象,我这里转自网上的一篇文章供参考 

    转自 http://huangyunbin.iteye.com/blog/1123442

    MessageDigest的功能及用法

    MessageDigest 类为应用程序提供信息摘要算法的功能,如 MD5 或 SHA 算法。信息摘要是安全的单向哈希函数,它接收任意大小的数据,并输出固定长度的哈希值。 

    MessageDigest 对象在创建后即处于初始化状态。通过 update() 方法向该对象送入待处理的数据;任何时候都可以调用 reset() 方法重置摘要。待所有数据都已送入之后,应调用某个 digest() 方法完成哈希计算。 

    对于给定数量的更新数据,digest 方法只能被调用一次。在调用 digest 之后,MessageDigest 对象被重新设置成其初始状态。 

    1、public static MessageDigest getInstance(String algorithm) 
                                     throws NoSuchAlgorithmException 

       返回实现指定摘要算法的 MessageDigest 对象。 

       algorithm - 所请求算法的名称 

    2、public static MessageDigest getInstance(String algorithm, 
                                            String provider) 
                                     throws NoSuchAlgorithmException, 
                                            NoSuchProviderException 

      返回实现指定摘要算法的 MessageDigest 对象。 

      algorithm - 所请求算法的名称 

      provider - 提供者的名称。 

    3、public void update(byte[] input) 

      使用指定的 byte 数组更新摘要。 

    4、public byte[] digest() 

      通过执行诸如填充之类的最终操作完成哈希计算。在调用此方法之后,摘要被重置。 

    5、public static boolean isEqual(byte[] digesta, 
                                  byte[] digestb) 

        比较两个摘要的相等性。做简单的字节比较。 


    注意:Provider可以通过 java.security.Security.getProviders() 方法获取已注册提供者列表。比较常用的有“SUN” 

    SUN提供的常用的算法名称有:
        MD2
        MD5
        SHA-1
        SHA-256
        SHA-384
        SHA-512

    Code举例: 

    import java.security.*; 
    /**
     * Small demonstration of {@link java.security.MessageDigest}: hashes a
     * message with SHA-1, prints the digest as hex, then recomputes the digest
     * the way a receiver would in order to verify the message.
     */
    public class myDigest {

      /** Upper-case hex digits used by {@link #byte2hex(byte[])}. */
      private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

      public static void main(String[] args) {
        myDigest my = new myDigest();
        my.testDigest();
      }

      /**
       * Computes the SHA-1 digest of a fixed message, prints it, and then
       * checks it against a second, independently computed digest. All
       * results are reported on standard output.
       */
      public void testDigest() {
        try {
          String myinfo = "我的测试信息";
          // Encode with an explicit charset: the digest is computed over
          // bytes, so relying on the platform default would make the result
          // differ from machine to machine.
          byte[] payload = myinfo.getBytes(java.nio.charset.StandardCharsets.UTF_8);

          // Any other supported algorithm name (e.g. "MD5") works the same way.
          java.security.MessageDigest alga =
              java.security.MessageDigest.getInstance("SHA-1");
          alga.update(payload);
          byte[] digesta = alga.digest();
          System.out.println("本信息摘要是:" + byte2hex(digesta));

          // The message (myinfo) and its digest (digesta) would now be passed
          // to another party, who recomputes the digest to detect tampering
          // or transmission errors.
          java.security.MessageDigest algb =
              java.security.MessageDigest.getInstance("SHA-1");
          algb.update(payload);
          // isEqual is static, so call it on the class rather than an instance.
          if (java.security.MessageDigest.isEqual(digesta, algb.digest())) {
            System.out.println("信息检查正常");
          } else {
            System.out.println("摘要不相同");
          }
        } catch (java.security.NoSuchAlgorithmException ex) {
          System.out.println("非法摘要算法");
        }
      }

      /**
       * Formats bytes as upper-case, two-digit hex values separated by colons,
       * for example {@code {0x0a, 0xff}} becomes {@code "0A:FF"}.
       *
       * @param b bytes to format; must not be {@code null}
       * @return the hex string, empty for an empty array
       */
      public String byte2hex(byte[] b) {
        // Two digits per byte plus one separator between neighbours.
        StringBuilder hex = new StringBuilder(Math.max(0, b.length * 3 - 1));
        for (int n = 0; n < b.length; n++) {
          if (n > 0) {
            hex.append(':');
          }
          int value = b[n] & 0xFF; // mask to treat the byte as unsigned
          hex.append(HEX_DIGITS[value >>> 4]).append(HEX_DIGITS[value & 0x0F]);
        }
        return hex.toString();
      }
    }

    关于Java加密的更多信息:http://www.ibm.com/developerworks/cn/java/l-security/

    --------------------------------------------------------------------------

    本系列Heritrix 3.1.0 源码解析系本人原创

    转载请注明出处 博客园 刺猬的温驯

    本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/30/3052411.html

  • 相关阅读:
    冲刺周期第七天
    软件体系架构课下作业01
    大型网站技术架构-核心原理与案例分析-阅读笔记6
    大型网站技术架构-核心原理与案例分析-阅读笔记5
    大型网站技术架构-核心原理与案例分析-阅读笔记4
    大型网站技术架构-核心原理与案例分析-阅读笔记3
    大型网站技术架构-核心原理与案例分析-阅读笔记02
    《大型网站技术架构核心原理与案例分析》阅读笔记-01
    掌握需求过程阅读笔记—3
    掌握需求过程阅读笔记—2
  • 原文地址:https://www.cnblogs.com/chenying99/p/3052411.html
Copyright © 2011-2022 走看看