zoukankan      html  css  js  c++  java
  • 大三上寒假15天--第15天

    今天学WebMagic爬虫又学到了一个小技巧:想要自己设计爬取内容的保存形式,可以不用重写Pipeline,直接在process()方法中写上你想要的保存操作,多数情况下可以达到相同的效果。我的爬虫程序想要将内容保存在一个txt中,就是这么实现的,个人感觉简单很多。这个技巧也是看了网上高手的文章才学到的,受益匪浅。

    爬取北京政府信件的工作到此就完成了,全部代码如下。我的保存特点是以空格隔开不同的信息,方便导入数据库:

    package my.webmagic2;
    
    
    
    import java.io.File;
    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.List;
    
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Request;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.model.HttpRequestBody;
    import us.codecraft.webmagic.pipeline.FilePipeline;
    import us.codecraft.webmagic.processor.PageProcessor;
    import us.codecraft.webmagic.scheduler.FileCacheQueueScheduler;
    import us.codecraft.webmagic.utils.HttpConstant;
    
    public class text implements PageProcessor{
        public static int h=1;
        private Site site=Site.me().setRetrySleepTime(3).setSleepTime(100);
        public int check=0;
        /**
         * @param args
         */
        public Site getSite() {
            // TODO Auto-generated method stub
            return site;
        }
        public void process(Page page) {
            // TODO Auto-generated method stub
            if(check==0){
                check++;
                String[] str1=page.getHtml().regex(""letter_type":"[^,]+").all().toString().split(",");
                String[] str2=page.getHtml().regex(""original_id":"[^,]+").all().toString().split(",");
                int len1,len2;
                for(int i=0;i<str1.length-1;i++){
                    len1=str1[i].length()-1;
                    str1[i]=str1[i].substring(16,len1);
                    len2=str2[i].length()-1;
                    str2[i]=str2[i].substring(16,len2);
                }
                str1[str1.length-1]= str1[str1.length-1].substring(16,str1[str1.length-1].length()-2);
                str2[str2.length-1]= str2[str2.length-1].substring(16,str2[str2.length-1].length()-2);
                for(int i=0;i<str2.length;i++){
                    if(str1[i].equals("咨询")){
                        page.addTargetRequest("http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId="+str2[i]);
                     }
                     else if(str1[i].equals("建议")){
                         page.addTargetRequest("http://www.beijing.gov.cn/hudong/hdjl/com.web.suggest.suggesDetail.flow?originalId="+str2[i]);
                     }
                     else if(str1[i].equals("投诉")){
                         page.addTargetRequest("http://www.beijing.gov.cn/hudong/hdjl/com.web.complain.complainDetail.flow?originalId="+str2[i]);
                     }
                     else{
                         page.addTargetRequest("http://www.beijing.gov.cn/hudong/hdjl/com.web.complain.complainDetail.flow?originalId="+str2[i]);
                     }
                }
            }
            else{
                File file=new File("/home/hadoop/xinjian");
                try {
                    FileWriter w=new FileWriter(file,true);
                if(page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[1]/div[2]/strong").toString()!=null){
                    String hf=page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[2]/div/div[1]/div[2]").toString();
                    hf=hf.replace("<div class="col-xs-12 col-md-12 column p-4 text-muted my-3">","" );
                    hf=hf.replace("</div>", "");
                    hf=hf.replaceAll("&nbsp;", "");
                    hf=hf.replaceAll("<p>", "");
                    hf=hf.replaceAll("</p>","");
                    hf=hf.replaceAll(" ", "");
                    hf=hf.replaceAll("
    ", "");
                    w.write(page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[1]/div[2]/strong/text()").toString().replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[2]/div[1]/text()").toString().substring(4).replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[2]/div[2]/text()").toString().substring(3).replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[2]/div[3]/label/text()").toString().replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[1]/div[3]/text()").toString().replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[2]/div/div[1]/div[1]/div[2]/text()").toString().replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/div[2]/div/div[1]/div[1]/div[3]/text()").toString().substring(5).replaceAll(" ","")
                            +" "
                            +hf
                            +"
    "
                                );
                    w.close();
                    h++;
                }else if(page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[1]/div[2]/strong").toString()!=null){
                    String hf=page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[2]/div/div[1]/div[2]").toString();
                    hf=hf.replace("<div class="col-xs-12 col-md-12 column p-4 text-muted my-3">","" );
                    hf=hf.replace("</div>", "");
                    hf=hf.replaceAll("&nbsp;", " ");
                    hf=hf.replaceAll("<p>", "");
                    hf=hf.replaceAll("</p>","");
                    hf=hf.replaceAll(" ", "");
                    hf=hf.replaceAll("
    ", "");
                    w.write(
                            page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[1]/div[2]/strong/text()").toString().replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[2]/div[1]/text()").toString().substring(4).replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[2]/div[2]/text()").toString().substring(3).replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[2]/div[3]/label/text()").toString().replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[1]/div[3]/text()").toString().replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[2]/div/div[1]/div[1]/div[2]/text()").toString().replaceAll(" ","")
                            +" "
                            +page.getHtml().xpath("/html/body/div[2]/div/div[2]/form/div[2]/div/div[1]/div[1]/div[3]/text()").toString().substring(5).replaceAll(" ","")
                            +" "
                            +hf
                            +"
    "
                                );
                    w.close();
                    h++;
                }else{
                    page.putField("all", page.getHtml().toString());
                    w.close();
                }
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
                
        }
    
        /**
         * @param args
         */
        public static void main(String[] args) {
            int j=0;
            for(int i=0;i<=5586;i++){
                j=i*6;
                // TODO Auto-generated method stub
                Request request = new Request("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.replyMailList.biz.ext");
                request.setMethod(HttpConstant.Method.POST);
                request.setRequestBody(HttpRequestBody.json("{'PageCond/begin':"+j+",'PageCond/length':6,'PageCond/isCount':'true','keywords':'','orgids':'','startDate':'','endDate':'','letterType':'2','letterStatue':''}","utf-8"));
                Spider.create(new text())
                .addRequest(request)
    .addPipeline(new FilePipeline("./xinjian/")) .setScheduler(
    new FileCacheQueueScheduler("./xinjian/")) .thread(5) .run(); System.out.println("完成"+i); } System.out.println("全部完成"); } }
  • 相关阅读:
    分布式设计与开发(一)------宏观概述
    分布式设计与开发(二)------几种必须了解的分布式算法
    分布式设计与开发(三)------高一致性服务ZooKeeper
    jvm工作原理
    JVM原理和优化
    分布式系统设计原理与方案
    在Linux上安装Memcached服务
    使用Memcached、Spring AOP构建数据库前端缓存框架
    Java使用memcached
    Memcache的部署和使用
  • 原文地址:https://www.cnblogs.com/my---world/p/12313824.html
Copyright © 2011-2022 走看看