  • Heritrix 3.1.0 Source Code Analysis (Part 17)

    Next we analyze the methods related to the BdbFrontier object's void finished(CrawlURI cURI) method, starting with processFinish(CrawlURI curi):

    /**
         * Note that the previously emitted CrawlURI has completed
         * its processing (for now).
         *
         * The CrawlURI may be scheduled to retry, if appropriate,
         * and other related URIs may become eligible for release
         * via the next next() call, as a result of finished().
         *
         * TODO: make as many decisions about what happens to the CrawlURI
         * (success, failure, retry) and queue (retire, snooze, ready) as 
         * possible elsewhere, such as in DispositionProcessor. Then, break
         * this into simple branches or focused methods for each case. 
         *  
         * @see org.archive.crawler.framework.Frontier#finished(org.archive.modules.CrawlURI)
         */
        protected void processFinish(CrawlURI curi) {
    //        assert Thread.currentThread() == managerThread;        
            long now = System.currentTimeMillis();
            // count this fetch attempt
            curi.incrementFetchAttempts();
            logNonfatalErrors(curi);
            
            WorkQueue wq = (WorkQueue) curi.getHolder();
            // always refresh budgeting values from current curi
            // (whose overlay settings should be active here)
            wq.setSessionBudget(getBalanceReplenishAmount());
            wq.setTotalBudget(getQueueTotalBudget());
            
            assert (wq.peek(this) == curi) : "unexpected peek " + wq;
    
            int holderCost = curi.getHolderCost();
            // does this URI need to be re-enqueued (retried)?
            if (needsReenqueuing(curi)) {
                // codes/errors which don't consume the URI, leaving it atop queue
                if(curi.getFetchStatus()!=S_DEFERRED) {
                    wq.expend(holderCost); // all retries but DEFERRED cost
                }
                // delay before retry, in milliseconds
                long delay_ms = retryDelayFor(curi) * 1000;
                curi.processingCleanup(); // lose state that shouldn't burden retry
                wq.unpeek(curi);
                // write the updated curi back to WorkQueue wq
                wq.update(this, curi); // rewrite any changes
                // route the queue to its next state (retire / snooze / re-enqueue)
                handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);
                appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,DEFERRED_FOR_RETRY));
                doJournalReenqueued(curi);
                wq.makeDirty();
                return; // no further dequeueing, logging, rescheduling to occur
            }
    
            // Curi will definitely be disposed of without retry, so remove from queue
            // remove this CrawlURI curi from WorkQueue wq
            wq.dequeue(this,curi);
            decrementQueuedCount(1);
            largestQueues.update(wq.getClassKey(), wq.getCount());
            log(curi);
            
            if (curi.isSuccess()) {
                // codes deemed 'success' 
                incrementSucceededFetchCount();
                totalProcessedBytes.addAndGet(curi.getRecordedSize());
                appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,SUCCEEDED));
                doJournalFinishedSuccess(curi);
               
            } else if (isDisregarded(curi)) {
                // codes meaning 'undo' (even though URI was enqueued, 
                // we now want to disregard it from normal success/failure tallies)
                // (eg robots-excluded, operator-changed-scope, etc)
                incrementDisregardedUriCount();
                appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,DISREGARDED));
                holderCost = 0; // no charge for disregarded URIs
                // TODO: consider reinstating forget-URI capability, so URI could be
                // re-enqueued if discovered again
                doJournalDisregarded(curi);
                
            } else {
                // codes meaning 'failure'
                incrementFailedFetchCount();
                appCtx.publishEvent(new CrawlURIDispositionEvent(this,curi,FAILED));
                // if exception, also send to crawlErrors
                if (curi.getFetchStatus() == S_RUNTIME_EXCEPTION) {
                    Object[] array = { curi };
                    loggerModule.getRuntimeErrors().log(Level.WARNING, curi.getUURI()
                            .toString(), array);
                }        
                // charge queue any extra error penalty
                wq.noteError(getErrorPenaltyAmount());
                doJournalFinishedFailure(curi);
                
            }
    
            wq.expend(holderCost); // successes & failures charge cost to queue
            // politeness delay, in milliseconds
            long delay_ms = curi.getPolitenessDelay();
            //long delay_ms = 0;
            // route the queue to its next state
            handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);
            wq.makeDirty();
            
            if(curi.getRescheduleTime()>0) {
                // marked up for forced-revisit at a set time
                curi.processingCleanup();
                curi.resetForRescheduling(); 
                futureUris.put(curi.getRescheduleTime(),curi);
                futureUriCount.incrementAndGet(); 
            } else {
                curi.stripToMinimal();
                curi.processingCleanup();
            }
        }
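
    Before turning to the helper methods, it helps to restate the overall shape of processFinish: an early-exit retry path, then a three-way split between success, disregarded, and failure, followed by queue accounting. The compressed sketch below only illustrates that control flow; the Disposition enum and the boolean parameters are my own simplification, not Heritrix API.

    public class ProcessFinishFlowSketch {

        enum Disposition { RETRY, SUCCESS, DISREGARDED, FAILURE }

        // Mirrors the branch order of processFinish: a URI needing re-enqueueing
        // short-circuits everything else; otherwise it is dequeued and counted
        // as exactly one of success, disregarded, or failure.
        static Disposition classify(boolean needsReenqueuing, boolean isSuccess, boolean isDisregarded) {
            if (needsReenqueuing) {
                return Disposition.RETRY;        // left atop its queue, retried later
            } else if (isSuccess) {
                return Disposition.SUCCESS;      // counted and journaled as finished-success
            } else if (isDisregarded) {
                return Disposition.DISREGARDED;  // e.g. robots-excluded; no cost charged
            } else {
                return Disposition.FAILURE;      // counted as failed, error penalty charged
            }
        }

        public static void main(String[] args) {
            System.out.println(classify(true, false, false));  // RETRY
            System.out.println(classify(false, true, false));  // SUCCESS
            System.out.println(classify(false, false, true));  // DISREGARDED
            System.out.println(classify(false, false, false)); // FAILURE
        }
    }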

    First, processFinish checks whether the CrawlURI curi object needs to be put back on the queue; the method is as follows:

    /**
         * Checks if a recently processed CrawlURI that did not finish successfully
         * needs to be reenqueued (and thus possibly, processed again after some 
         * time elapses)
         * 
         * @param curi
         *            The CrawlURI to check
         * @return True if we need to retry.
         */
        protected boolean needsReenqueuing(CrawlURI curi) {
            // has the maximum number of retries (default 30) been exceeded?
            if (overMaxRetries(curi)) {
                return false;
            }
            // decide based on the fetch status
            switch (curi.getFetchStatus()) {
            case HttpStatus.SC_UNAUTHORIZED:
                // We can get here though usually a positive status code is
                // a success. We get here if there is rfc2617 credential data
                // loaded and we're supposed to go around again. See if any
                // rfc2617 credential present and if there, assume it got
                // loaded in FetchHTTP on expectation that we're to go around
                // again. If no rfc2617 loaded, we should not be here.
                boolean loaded = curi.hasRfc2617Credential();
                if (!loaded && logger.isLoggable(Level.FINE)) {
                    logger.fine("Have 401 but no creds loaded " + curi);
                }
                return loaded;
            case S_DEFERRED:
            case S_CONNECT_FAILED:
            case S_CONNECT_LOST:
            case S_DOMAIN_UNRESOLVABLE:
                // these are all worth a retry
                // TODO: consider if any others (S_TIMEOUT in some cases?) deserve
                // retry
                return true;
            case S_UNATTEMPTED:
                if(curi.includesRetireDirective()) {
                    return true;
                } // otherwise, fall-through: no status is an error without queue-directive
            default:
                return false;
            }
        }
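
    The overMaxRetries check at the top of this method is not shown here; judging from the comment, it compares the URI's accumulated fetch attempts against a configured maximum (30 by default). A minimal standalone sketch of that idea follows; SimpleCrawlURI, the maxRetries field, and the use of >= are assumptions rather than the exact Heritrix code.

    public class RetryLimitSketch {

        static class SimpleCrawlURI {
            private int fetchAttempts;
            void incrementFetchAttempts() { fetchAttempts++; }
            int getFetchAttempts() { return fetchAttempts; }
        }

        private final int maxRetries = 30; // default mentioned in the comment above

        // A URI is no longer eligible for retry once its attempt counter
        // reaches the configured maximum.
        boolean overMaxRetries(SimpleCrawlURI curi) {
            return curi.getFetchAttempts() >= maxRetries;
        }

        public static void main(String[] args) {
            RetryLimitSketch sketch = new RetryLimitSketch();
            SimpleCrawlURI curi = new SimpleCrawlURI();
            for (int i = 0; i < 30; i++) {
                curi.incrementFetchAttempts();
            }
            System.out.println(sketch.overMaxRetries(curi)); // true: the 30th attempt hits the limit
        }
    }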

    The long retryDelayFor(CrawlURI curi) method supplies the delay used when snoozing the WorkQueue wq before a retry:

    /**
         * Return a suitable value to wait before retrying the given URI.
         * 
         * @param curi
         *            CrawlURI to be retried
         * @return millisecond delay before retry
         */
        protected long retryDelayFor(CrawlURI curi) {
            int status = curi.getFetchStatus();
            return (status == S_CONNECT_FAILED || status == S_CONNECT_LOST ||
                    status == S_DOMAIN_UNRESOLVABLE)? getRetryDelaySeconds() : 0;
                    // no delay for most
        }

    getRetryDelaySeconds() defaults to 900 seconds (15 minutes).
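
    Since processFinish multiplies the returned seconds by 1000, the default works out to a 900,000 ms delay. A tiny arithmetic sketch follows; the wake-time line is an assumption about how snoozeQueue uses now and delay_ms.

    public class RetryDelaySketch {
        public static void main(String[] args) {
            long retryDelaySeconds = 900;              // getRetryDelaySeconds() default
            long delayMs = retryDelaySeconds * 1000;   // delay_ms = retryDelayFor(curi) * 1000
            long now = System.currentTimeMillis();
            long assumedWakeTime = now + delayMs;      // assumption: the snoozed queue wakes at now + delay_ms
            System.out.println("snooze for " + delayMs + " ms (about " + (delayMs / 60000) + " minutes)");
            System.out.println("assumed wake time: " + assumedWakeTime);
        }
    }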

    What follows writes the CrawlURI curi object back to the WorkQueue wq, and finally resets the queue placement of the WorkQueue wq: the queue is either retired (no longer active), snoozed, or passed to reenqueueQueue(wq) for further handling.

    /**
         * Reset the queue placement of the WorkQueue wq.
         * Send an active queue to its next state, based on the supplied 
         * parameters.
         * 
         * @param wq
         * @param forceRetire
         * @param now
         * @param delay_ms
         */
        protected void handleQueue(WorkQueue wq, boolean forceRetire, long now, long delay_ms) {
            
            inProcessQueues.remove(wq);
            if(forceRetire) {
                retireQueue(wq);
            } else if (delay_ms > 0) {
                snoozeQueue(wq, now, delay_ms);
            } else {
                //Enqueue the given queue to either readyClassQueues or inactiveQueues,as appropriate
                reenqueueQueue(wq);
            }
        }
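
    To see the three possible outcomes at a glance, here is a minimal restatement of handleQueue's branch order; the QueueDisposition enum and decide method are illustrative only, not part of Heritrix.

    public class HandleQueueSketch {

        enum QueueDisposition { RETIRE, SNOOZE, REENQUEUE }

        // Mirrors the branch order in handleQueue: a forced retire wins, then a
        // positive delay snoozes the queue, otherwise it goes back to the ready
        // or inactive queues.
        static QueueDisposition decide(boolean forceRetire, long delayMs) {
            if (forceRetire) {
                return QueueDisposition.RETIRE;
            } else if (delayMs > 0) {
                return QueueDisposition.SNOOZE;
            } else {
                return QueueDisposition.REENQUEUE;
            }
        }

        public static void main(String[] args) {
            System.out.println(decide(false, 900000)); // SNOOZE: e.g. a connect-failed retry
            System.out.println(decide(false, 0));      // REENQUEUE: ready for its next URI
            System.out.println(decide(true, 0));       // RETIRE: retire directive present
        }
    }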

    Looking further down processFinish, the call wq.dequeue(this, curi) removes the CrawlURI curi object from the WorkQueue wq.

    Finally, the queue placement of the WorkQueue wq is reset once more:

     long delay_ms = curi.getPolitenessDelay(); 
     handleQueue(wq,curi.includesRetireDirective(),now,delay_ms);

    The handleQueue method was covered above.
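
    The politeness delay itself is computed earlier in the processing chain (the TODO in processFinish points at DispositionProcessor). As a hedged illustration of the usual crawl-politeness idea, waiting some multiple of the last fetch's duration clamped between a minimum and a maximum, here is a standalone sketch; the field names and default values are assumptions, not necessarily Heritrix's actual settings.

    public class PolitenessDelaySketch {

        // Illustrative parameters; actual Heritrix defaults may differ.
        private final double delayFactor = 5.0;
        private final long minDelayMs = 3000;
        private final long maxDelayMs = 30000;

        // Assumed formula: wait delayFactor times as long as the last fetch took,
        // but never less than minDelayMs nor more than maxDelayMs.
        long politenessDelayFor(long fetchDurationMs) {
            long delay = (long) (fetchDurationMs * delayFactor);
            return Math.max(minDelayMs, Math.min(maxDelayMs, delay));
        }

        public static void main(String[] args) {
            PolitenessDelaySketch sketch = new PolitenessDelaySketch();
            System.out.println(sketch.politenessDelayFor(200));   // 3000: clamped up to the minimum
            System.out.println(sketch.politenessDelayFor(2000));  // 10000: 5x the fetch duration
            System.out.println(sketch.politenessDelayFor(20000)); // 30000: clamped down to the maximum
        }
    }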

    ---------------------------------------------------------------------------

    This Heritrix 3.1.0 source code analysis series is my original work.

    When reposting, please credit the source: 博客园 刺猬的温驯

    Permalink: http://www.cnblogs.com/chenying99/archive/2013/04/21/3033520.html
