zoukankan      html  css  js  c++  java
  • 【Heritrix基础教程之4】开始一个爬虫抓取的全流程代码分析


    在创建一个job后,就要开始job的运行,运行的全流程如下:

    1、在界面上启动job


    2、index.jsp

    查看上述页面对应的源代码

    <a href='"+request.getContextPath()+"/console/action.jsp?action=start'>Start</a>

    3、action.jsp


        // Dispatch on the "action" request parameter. The comparisons are
        // written constant-first, so a missing (null) parameter matches no
        // branch and control falls straight through to the redirect.
        String sAction = request.getParameter("action");
        if ("start".equalsIgnoreCase(sAction)) {
            // Tell handler to start crawl job
            handler.startCrawler();
        } else if ("stop".equalsIgnoreCase(sAction)) {
            // Tell handler to stop crawl job
            handler.stopCrawler();
        } else if ("terminate".equalsIgnoreCase(sAction)) {
            // Delete the current job, but only when there is one
            if (handler.getCurrentJob() != null) {
                handler.deleteJob(handler.getCurrentJob().getUID());
            }
        } else if ("pause".equalsIgnoreCase(sAction)) {
            // Tell handler to pause crawl job
            handler.pauseJob();
        } else if ("resume".equalsIgnoreCase(sAction)) {
            // Tell handler to resume crawl job
            handler.resumeJob();
        } else if ("checkpoint".equalsIgnoreCase(sAction)) {
            // Checkpoint only when a job is currently set
            if (handler.getCurrentJob() != null) {
                handler.checkpointJob();
            }
        }
        // Whatever the action was, send the browser back to index.jsp.
        response.sendRedirect(request.getContextPath() + "/index.jsp");

    4、CrawlJobHandler.java

    (1)

        /**
         * Flags the handler as running and, if at least one job is pending
         * and no crawl is currently in progress, starts the next job.
         */
        public void startCrawler() {
            running = true;
            if (pendingCrawlJobs.size() <= 0) {
                // Nothing queued, so there is nothing to start.
                return;
            }
            if (isCrawling()) {
                // A crawl is already under way; leave it alone.
                return;
            }
            startNextJob();
        }

    (2)

        /**
         * Starts the next job on a freshly created thread named
         * "StartNextJob".
         *
         * <p>If a previous starter thread exists, this method first waits for
         * it to finish. The wait and the creation of the new thread both
         * happen while holding this object's monitor, so only one caller at a
         * time can replace {@code startingNextJob}.
         *
         * <p>If the calling thread is interrupted while waiting, the stack
         * trace is printed, the thread's interrupt status is restored so that
         * callers can still observe the interruption, and no new starter
         * thread is created.
         */
        protected final void startNextJob() {
            synchronized (this) {
                if (startingNextJob != null) {
                    try {
                        // Wait for the previous starter thread to complete.
                        startingNextJob.join();
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                        // join() cleared the interrupt flag when it threw;
                        // set it again instead of silently dropping it.
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
                startingNextJob = new Thread(new Runnable() {
                    public void run() {
                        startNextJobInternal();
                    }
                }, "StartNextJob");
                startingNextJob.start();
            }
        }

    (3)

       /**
        * Takes the first pending job, makes it the current job and requests
        * that its controller start crawling.
        *
        * <p>Returns immediately when there is no pending job or a crawl is
        * already in progress. If setting up or starting the job throws an
        * {@code InitializationException}, the job's state file is passed to
        * {@code loadJob}, the current job reference is cleared, and this
        * method calls itself again to try the next pending job.
        */
       protected void startNextJobInternal() {
            if (pendingCrawlJobs.size() == 0 || isCrawling()) {
                // No job ready or already crawling.
                return;
            }
            // Promote the head of the pending collection to current job, then
            // take it out of the pending collection.
            this.currentJob = (CrawlJob)pendingCrawlJobs.first();
            assert pendingCrawlJobs.contains(currentJob) :
                "pendingCrawlJobs is in an illegal state";
            pendingCrawlJobs.remove(currentJob);
            try {
                this.currentJob.setupForCrawlStart();
                // This is ugly but needed so I can clear the currentJob
                // reference in the crawlEnding and update the list of completed
                // jobs.  Also, crawlEnded can startup next job.
                this.currentJob.getController().addCrawlStatusListener(this);
                // now, actually start
                this.currentJob.getController().requestCrawlStart();
            } catch (InitializationException e) {
                // NOTE(review): presumably loadJob re-registers the failed job
                // from its on-disk state; confirm against loadJob itself.
                loadJob(getStateJobFile(this.currentJob.getDirectory()));
                this.currentJob = null;
                startNextJobInternal(); // Load the next job if there is one.
            }
        }

    (4)上面(3)中通过 getController() 取得控制器对象,并调用它的 requestCrawlStart():

        /**
         * Runs the processors' initial tasks, announces the crawl state
         * changes, starts the statistics thread and finally starts the
         * frontier.
         *
         * <p>Two state-change events are sent: first {@code STARTED} with the
         * pending job status, then (after {@code state} is set to
         * {@code RUNNING}) the new state with the running job status.
         */
        public void requestCrawlStart() {
            runProcessorInitialTasks();

            sendCrawlStateChangeEvent(STARTED, CrawlJob.STATUS_PENDING);
            state = RUNNING;
            sendCrawlStateChangeEvent(this.state, CrawlJob.STATUS_RUNNING);

            // Start from the abnormal-finish status; a proper exit will
            // change this value.
            this.sExit = CrawlJob.STATUS_FINISHED_ABNORMAL;

            // Run the statistics object on its own named thread.
            Thread statLogger = new Thread(statistics, "StatLogger");
            statLogger.start();

            frontier.start();
        }



  • 相关阅读:
    FZU 2098 刻苦的小芳(卡特兰数,动态规划)
    卡特兰数总结
    FZU 1064 教授的测试(卡特兰数,递归)
    HDU 4745 Two Rabbits(区间DP,最长非连续回文子串)
    Java 第十一届 蓝桥杯 省模拟赛 正整数的摆动序列
    Java 第十一届 蓝桥杯 省模拟赛 反倍数
    Java 第十一届 蓝桥杯 省模拟赛 反倍数
    Java 第十一届 蓝桥杯 省模拟赛 反倍数
    Java 第十一届 蓝桥杯 省模拟赛 凯撒密码加密
    Java 第十一届 蓝桥杯 省模拟赛 凯撒密码加密
  • 原文地址:https://www.cnblogs.com/jediael/p/4304128.html
Copyright © 2011-2022 走看看