zoukankan      html  css  js  c++  java
  • Heritrix 3.1.0 源码解析(二十七)

    上文分析了Heritrix3.1.0系统对请求认证机制的封装,本文接下来分析Heritrix3.1.0系统对cookies处理的封装

    Heritrix3.1.0系统提供了CookieStorage接口,用于提供cookies的存储

    CookieStorage接口很简单,声明了保存cookies对象的Map容器的方法和获取cookies对象的Map容器的方法

    /**
     * Storage abstraction for cookies: exposes the sorted map that holds the
     * {@code Cookie} objects and a hook for saving such a map.
     */
    public interface CookieStorage extends Lifecycle {
    
        /**
         * Returns the sorted map of cookies held by this storage.
         *
         * @return the cookie map, keyed by String
         *         (NOTE(review): AbstractCookieStorage fills it with
         *         {@code cookie.getSortKey()} keys - confirm for other callers)
         */
        SortedMap<String,Cookie> getCookiesMap();
    
        /**
         * Saves the given cookie map.
         *
         * @param map the cookies to save
         */
        void saveCookiesMap(Map<String,Cookie> map);
    
    }

    抽象类AbstractCookieStorage实现了CookieStorage接口,用于为具体实现类提供公用模板 

    /**
     * Template base class for {@link CookieStorage} implementations.
     *
     * <p>It manages the running flag, optionally loads cookies from a
     * tab-separated cookies.txt style file on {@link #start()}, and optionally
     * dumps the cookie map to a file whenever {@link #saveCookiesMap(Map)} is
     * called. Subclasses supply the backing map through {@link #prepareMap()}
     * and any store-specific persistence through
     * {@link #innerSaveCookiesMap(Map)}.
     */
    public abstract class AbstractCookieStorage 
        implements CookieStorage, 
                   Lifecycle, // InitializingBean, 
                   Closeable {
    
        private static final Logger LOGGER = 
            Logger.getLogger(AbstractCookieStorage.class.getName());

        /** Number of tab-separated fields in one cookie entry. */
        private static final int COOKIE_FIELD_COUNT = 7;

        /** Matches lines that are blank or contain only a '#' comment. */
        private static final java.util.regex.Pattern BLANK_OR_COMMENT =
            java.util.regex.Pattern.compile("\\s*(?:#.*)?");

        /** Cookies file read on {@link #start()}; {@code null} disables loading. */
        protected ConfigFile cookiesLoadFile = null;
        public ConfigFile getCookiesLoadFile() {
            return cookiesLoadFile;
        }
        public void setCookiesLoadFile(ConfigFile cookiesLoadFile) {
            this.cookiesLoadFile = cookiesLoadFile;
        }
    
        /** Path the cookies are dumped to on save; {@code null} disables dumping. */
        protected ConfigPath cookiesSaveFile = null;
        public ConfigPath getCookiesSaveFile() {
            return cookiesSaveFile;
        }
        public void setCookiesSaveFile(ConfigPath cookiesSaveFile) {
            this.cookiesSaveFile = cookiesSaveFile;
        }
    
        boolean isRunning = false; 

        /**
         * Initializes the storage: asks the subclass for its map and, when a
         * load file is configured, fills the map from that file. Does nothing
         * if the storage is already running.
         */
        @Override
        public void start() {
            if(isRunning()) {
                return;
            }
            SortedMap<String,Cookie> cookies = prepareMap();
            if (getCookiesLoadFile()!=null) {
                // seed the map from the configured cookies file
                loadCookies(getCookiesLoadFile(), cookies);
            }
            isRunning = true; 
        }

        @Override
        public boolean isRunning() {
            return isRunning;
        }

        @Override
        public void stop() {
            isRunning = false; 
        }

        /**
         * Creates (or opens) the sorted map that will hold the cookies.
         * Implemented by concrete subclasses.
         *
         * @return the map that {@link #start()} loads cookies into
         */
        protected abstract SortedMap<String,Cookie> prepareMap();    
        
        /**
         * Loads cookies from a reader into the given map.
         *
         * <p>Each entry is one line of seven tab-separated fields: domain,
         * domain-attribute flag, path, secure flag, expiration (epoch seconds,
         * negative for none), name, value. Blank lines and '#' comment lines
         * are skipped. Malformed lines are logged and skipped; they do not
         * abort the load. The reader is not closed here.
         *
         * @param reader source of the cookie lines
         * @param cookies map that receives the parsed cookies, keyed by
         *        {@code cookie.getSortKey()}
         */
        public static void loadCookies(Reader reader,
                SortedMap<String, Cookie> cookies) {
            BufferedReader br = new BufferedReader(reader);
            try {
                String line;
                int lineNo = 0;
                while ((line = br.readLine()) != null) {
                    lineNo++;
                    if (BLANK_OR_COMMENT.matcher(line).matches()) {
                        continue; // skip blank lines and comments
                    }

                    // limit -1 keeps trailing empty fields, so a cookie saved
                    // with an empty value still yields seven tokens on reload
                    String[] tokens = line.split("\\t", -1);
                    // tolerate surplus trailing tabs, as the plain split did
                    int fieldCount = tokens.length;
                    while (fieldCount > COOKIE_FIELD_COUNT
                            && tokens[fieldCount - 1].length() == 0) {
                        fieldCount--;
                    }
                    if (fieldCount != COOKIE_FIELD_COUNT) {
                        LOGGER.warning("cookies input line " + lineNo + " invalid, expected 7 tab-delimited tokens");
                        continue;
                    }

                    long epochSeconds;
                    try {
                        epochSeconds = Long.parseLong(tokens[4]);
                    } catch (NumberFormatException e) {
                        // one bad expiration must not abort the whole file
                        LOGGER.warning("cookies input line " + lineNo
                                + " invalid, expiration is not a number: " + tokens[4]);
                        continue;
                    }
                    Date expirationDate = (epochSeconds >= 0 ? new Date(epochSeconds * 1000) : null);

                    Cookie cookie = new Cookie(tokens[0], tokens[5],
                            tokens[6], tokens[2], expirationDate, 
                            Boolean.parseBoolean(tokens[3]));
                    cookie.setDomainAttributeSpecified(Boolean.parseBoolean(tokens[1]));

                    LOGGER.fine("Adding cookie: domain " + cookie.getDomain() + " cookie " + cookie.toExternalForm());
                    cookies.put(cookie.getSortKey(), cookie);
                }
            } catch (IOException e) {
                LOGGER.log(Level.WARNING,e.getMessage(), e);
            }
        }

        /**
         * Loads cookies from a configured file into the given map, closing
         * the reader obtained from the file afterwards.
         *
         * @param file configured cookies file
         * @param cookies map that receives the parsed cookies
         */
        protected static void loadCookies(ConfigFile file,
                SortedMap<String, Cookie> cookies) {
            
            Reader reader = null;
            try {
                reader = file.obtainReader();
                loadCookies(reader, cookies);
            } finally {
                IOUtils.closeQuietly(reader);
            }
        }
    
        /**
         * Loads cookies from the file at the given path into the given map.
         * Does nothing when the path is {@code null} or empty; a missing file
         * is logged as a warning.
         *
         * @param cookiesFile path of the cookies file
         * @param result map that receives the parsed cookies
         */
        public static void loadCookies(String cookiesFile, 
                SortedMap<String,Cookie> result) {
    
            // Do nothing if cookiesFile is not specified.
            if (cookiesFile == null || cookiesFile.length() <= 0) {
                return;
            }
            
            FileReader reader = null;
            try {
                reader = new FileReader(cookiesFile);
                loadCookies(reader, result);
            } catch (FileNotFoundException e) {
                LOGGER.log(Level.WARNING,"Could not find file: " + cookiesFile, e);
            } finally {
                IOUtils.closeQuietly(reader);
            }
        }

        /**
         * Writes the cookies of the given map to a file, one line of seven
         * tab-separated fields per cookie, preceded by two comment header
         * lines. Does nothing when the path is {@code null} or empty; an I/O
         * failure is logged at SEVERE level.
         *
         * @param saveCookiesFile path of the file to (over)write
         * @param cookies cookies to write
         */
        public static void saveCookies(String saveCookiesFile, Map<String,Cookie> cookies) { 
            // Do nothing if cookiesFile is not specified. 
            if (saveCookiesFile == null || saveCookiesFile.length() <= 0) { 
                return; 
            }
          
            FileOutputStream out = null; 
            try { 
                out = new FileOutputStream(new File(saveCookiesFile)); 
                String tab ="\t"; 
                out.write("# Heritrix Cookie File\n".getBytes()); 
                out.write("# This file is the Netscape cookies.txt format\n\n".getBytes()); 
                for (Cookie cookie: cookies.values()) { 
                    Date expiry = cookie.getExpiryDate();
                    String value = cookie.getValue();

                    StringBuilder line = new StringBuilder(1024 * 2); 
                    line.append(cookie.getDomain()).append(tab);
                    line.append(cookie.isDomainAttributeSpecified() ? "TRUE" : "FALSE").append(tab);
                    line.append(cookie.getPath()).append(tab);
                    line.append(cookie.getSecure() ? "TRUE" : "FALSE").append(tab);
                    // expiration in epoch seconds; -1 means no expiry date
                    line.append(expiry != null ? expiry.getTime() / 1000 : -1).append(tab);
                    line.append(cookie.getName()).append(tab);
                    line.append(value != null ? value : "");
                    line.append("\n");
                    out.write(line.toString().getBytes()); 
                } 
            } catch (IOException e) {
                LOGGER.log(Level.SEVERE, "Unable to write " + saveCookiesFile, e);
            } finally {
                IOUtils.closeQuietly(out);
            } 
        }

        /**
         * Returns the cookie map. Implemented by concrete subclasses.
         */
        @Override
        public abstract SortedMap<String,Cookie> getCookiesMap();

        /**
         * Saves the given cookie map: first lets the subclass persist it via
         * {@link #innerSaveCookiesMap(Map)}, then, when a save file is
         * configured, also dumps the cookies to that file.
         *
         * @param map cookies to save
         */
        @Override
        public void saveCookiesMap(Map<String, Cookie> map) {
            innerSaveCookiesMap(map);
            if (getCookiesSaveFile()!=null) {
                saveCookies(getCookiesSaveFile().getFile().getAbsolutePath(), map);
            }
        }

        /**
         * Store-specific part of {@link #saveCookiesMap(Map)}. Implemented by
         * concrete subclasses.
         *
         * @param map cookies to save
         */
        protected abstract void innerSaveCookiesMap(Map<String,Cookie> map);

        /** No-op in this base class. */
        @Override
        public void close() throws IOException {
        }
    
    }

    Heritrix3.1.0提供了两个继承类,分别为BdbCookieStorage和SimpleCookieStorage,前者将cookies保存在BDB数据库,后者保存在Map对象里面

    BdbCookieStorage类的相关方法如下

    /** BDB module used by prepareMap() to obtain the class catalog and open the cookie database. */
    protected BdbModule bdb;
        /**
         * Sets the BDB module; annotated for autowired injection.
         *
         * @param bdb the module to use
         */
        @Autowired
        public void setBdbModule(BdbModule bdb) {
            this.bdb = bdb;
        }
        
        /**
         * Are we a checkpoint recovery? (in which case, reuse stored cookie data?)
         * Passed as the third argument to bdb.openDatabase(...) in prepareMap().
         */
        boolean isCheckpointRecovery = false; 
        
        /**
         * Name of the BDB database that holds the cookies.
         * NOTE(review): public, static and non-final, so any code can reassign it.
         */
        public static String COOKIEDB_NAME = "http_cookies";
     
        /** Database handle opened in prepareMap(). */
        private transient Database cookieDb;
        /** Cookie map created over cookieDb in prepareMap(); null until then in the code shown. */
        private transient StoredSortedMap<String,Cookie> cookies;
    
        /** Creates an unconfigured storage; the constructor itself does nothing. */
        public BdbCookieStorage() {
        }
    
        /**
         * Opens the cookie database through the BDB module and wraps it in a
         * writable {@code StoredSortedMap} with String keys and serialized
         * {@code Cookie} values.
         *
         * @return the database-backed cookie map
         * @throws RuntimeException wrapping any {@code DatabaseException}
         */
        protected SortedMap<String,Cookie> prepareMap() {
            try {
                StoredClassCatalog catalog = bdb.getClassCatalog();

                // non-transactional database, created if it does not exist yet
                BdbModule.BdbConfig config = new BdbModule.BdbConfig();
                config.setTransactional(false);
                config.setAllowCreate(true);
                cookieDb = bdb.openDatabase(COOKIEDB_NAME, config, isCheckpointRecovery);

                StringBinding keyBinding = new StringBinding();
                SerialBinding<Cookie> valueBinding =
                    new SerialBinding<Cookie>(catalog,Cookie.class);
                cookies = new StoredSortedMap<String,Cookie>(
                        cookieDb, keyBinding, valueBinding, true);
            } catch (DatabaseException e) {
                throw new RuntimeException(e);
            }
            return cookies;
        }
    
        /**
         * Returns the cookie map created by prepareMap().
         *
         * @return the stored map; in the code shown it is only assigned in
         *         prepareMap(), so it is null before that has run
         */
        public SortedMap<String, Cookie> getCookiesMap() {
    //        assert cookies != null : "cookie map not set up";
            return cookies;
        }
    
        /**
         * Intentionally empty: nothing is done with the given map here.
         * NOTE(review): presumably because the cookie map is already backed by
         * the BDB database - confirm.
         *
         * @param map ignored
         */
        protected void innerSaveCookiesMap(Map<String, Cookie> map) {
        }

    SimpleCookieStorage类与之类似,不在这里贴出来了

    这里需要注意的是,Heritrix3.1.0系统改写了HttpClient组件的Cookie类,逻辑与HttpClient组件的Cookie类类似

    那么Heritrix3.1.0系统怎样将CookieStorage接口实现类获取的SortedMap<String, Cookie>容器中的Cookies添加到HttpClient组件的相关对象中呢?

    Heritrix3.1.0系统还改写了HttpClient组件的HttpState类,添加了设置SortedMap cookiesMap对象的方法,相关方法如下

    /** Cookie map of this state; a ConcurrentSkipListMap unless replaced via setCookiesMap(). */
    private SortedMap cookiesMap = new ConcurrentSkipListMap();
    // START IA/HERITRIX ADDITIONS
        /**
         * Returns a sorted map of {@link Cookie cookies} that this HTTP
         * state currently contains.
         * 
         * Any operations on this map should be synchronized with respect 
         * to this HttpState instance.
         * 
         * @return sorted map of {@link Cookie cookies}
         */
        public SortedMap getCookiesMap() {
            return cookiesMap;
        }
        
        /**
         * Replaces the standard sorted map with an external implementation
         * (such as one backed by a persistent store, like BDB's StoredSortedMap).
         * The given map is stored as-is; no copy is made and no null check is done.
         * 
         * @param map alternate sorted map to use to store cookies
         */
        public void setCookiesMap(SortedMap map) {
            this.cookiesMap = map;
        }
    // END IA/HERITRIX ADDITIONS

    同时HttpMethodBase对象相关方法里面从HttpState state对象获取Cookies对象也做了相应的改写 

    /**
         * Rebuilds the <tt>Cookie</tt> request headers of this method from the
         * {@link Cookie cookie}s in the given state that match the target
         * host, port, path and security level.
         *
         * <p>Previously auto-generated <tt>Cookie</tt> headers are removed
         * first. The host used for matching is the virtual host from the
         * method parameters when set, otherwise the connection's host.
         *
         * @param state the {@link HttpState state} information associated with this method
         * @param conn the {@link HttpConnection connection} used to execute
         *        this HTTP method
         *
         * @throws IOException if an I/O (transport) error occurs. Some transport exceptions
         *                     can be recovered from.
         * @throws HttpException  if a protocol exception occurs. Usually protocol exceptions 
         *                    cannot be recovered from.
         */
        protected void addCookieRequestHeader(HttpState state, HttpConnection conn)
            throws IOException, HttpException {
    
            LOG.trace("enter HttpMethodBase.addCookieRequestHeader(HttpState, "
                      + "HttpConnection)");
    
            // Discard Cookie headers generated by an earlier call.
            for (Header existing : getRequestHeaderGroup().getHeaders("Cookie")) {
                if (existing.isAutogenerated()) {
                    getRequestHeaderGroup().removeHeader(existing);
                }
            }
    
            CookieSpec spec = getCookieSpec(state);
            String virtualHost = this.params.getVirtualHost();
            String host = (virtualHost != null) ? virtualHost : conn.getHost();

            // BEGIN IA/HERITRIX CHANGES
            Cookie[] matched = spec.match(host, conn.getPort(),
                getPath(), conn.isSecure(), state.getCookiesMap());
            // END IA/HERITRIX CHANGES
            if (matched == null || matched.length == 0) {
                return;
            }

            if (getParams().isParameterTrue(HttpMethodParams.SINGLE_COOKIE_HEADER)) {
                // Strict mode: all cookies share one header.
                String combined = spec.formatCookies(matched);
                getRequestHeaderGroup().addHeader(new Header("Cookie", combined, true));
                return;
            }

            // Non-strict mode: one header per cookie.
            for (Cookie cookie : matched) {
                String formatted = spec.formatCookie(cookie);
                getRequestHeaderGroup().addHeader(new Header("Cookie", formatted, true));
            }
        }

    最后我们怎样在配置文件crawler-beans.cxml配置cookie文件呢,本人做了一个示例

     <!-- BDBCOOKIESTORAGE: disk-based cookie storage for FetchHTTP -->
     <bean id="cookieStorage" 
       class="org.archive.modules.fetcher.BdbCookieStorage">
      <!-- file the cookies are loaded from when the storage starts -->
      <property name="cookiesLoadFile"><ref bean="cookieInit"/></property> 
      <!-- file the cookies are dumped to when the cookie map is saved -->
      <property name="cookiesSaveFile"><ref bean="cookieSave"/></property>
      <!-- The setter shown for BdbCookieStorage is setBdbModule(...), so the
           JavaBeans property name is "bdbModule", not "bdb". -->
      <property name="bdbModule">
            <ref bean="bdb"/>
      </property>
     </bean>
     <bean id="cookieInit" class="org.archive.spring.ConfigFile">
        <property name="name" value="cookie.txt" />
        <property name="path" value="/root/stpl/cookie.txt" />
     </bean>
     <bean id="cookieSave" class="org.archive.spring.ConfigPath">
        <property name="name" value="cookies_dump.txt" />
        <property name="path" value="/root/stpl/cookies_dump.txt" />
     </bean>

    cookie.txt文件格式可以参考这段英文注释,这段注释你懂的

    * format. Example entry of cookies.txt file:
         * <p>
         * www.archive.org FALSE / FALSE 1311699995 details-visit texts-cralond
         * </p>
         * <p>
         * Each line has 7 tab-separated fields:
         * </p>
         * <ol>
         * <li>DOMAIN: The domain that created and has access to the cookie value.</li>
         * <li>FLAG: A TRUE or FALSE value indicating if hosts within the given
         * domain can access the cookie value.</li>
         * <li>PATH: The path within the domain that the cookie value is valid for.</li>
         * <li>SECURE: A TRUE or FALSE value indicating if to use a secure
         * connection to access the cookie value.</li>
         * <li>EXPIRATION: The expiration time of the cookie value, or -1 for no
         * expiration</li>
         * <li>NAME: The name of the cookie value</li>
         * <li>VALUE: The cookie value</li>
         * </ol>

    ---------------------------------------------------------------------------

    本系列Heritrix 3.1.0 源码解析系本人原创

    转载请注明出处 博客园 刺猬的温驯

    本文链接 http://www.cnblogs.com/chenying99/archive/2013/04/28/3049673.html

  • 相关阅读:
    等待通知--wait notify
    表单重复提交与解决
    Cookie Session 与Token
    springMVC实现登陆
    第11章 AOF持久化
    第10章 RDB持久化
    MyBatis动态SQL
    第4章 网络层
    第9章 数据库
    代理设计模式
  • 原文地址:https://www.cnblogs.com/chenying99/p/3049673.html
Copyright © 2011-2022 走看看