zoukankan      html  css  js  c++  java
  • 利用httpclient+jericho多线程实现抓取网页内容

     任务描述:
    某图书网站按条件查询得出一页20条记录,每条记录有书目的简要信息和url链接到书的详细信息。
    需要抓取网站图书的详细信息,保存到本地数据库中。

    任务分析:
    用httpclient模拟执行url将网站的信息取回,再用jericho包,分析页面元素,将需要的信息取出,保存到数据库中。
    因为数据量比较大,还是采用多线程的方式来执行抓取详细页面,分析获得数据。

    处理过程:
    按条件查询到图书列表信息后,主线程不停地向下翻页,解析出每本图书的详细页url,并将url保存到一个ArrayList中;同时启动多个子线程分别去抓取详细页面的信息,然后利用jericho包分析页面数据并保存到数据库中。

      

    代码实现截取如下:
     ......

    public class BookCatcher
    {
        private static ArrayList threads= new ArrayList();//存储未处理URL
        public static boolean isFinished=false;
       
      public String getUrl() {
       try {
        synchronized (threads) {
         if (threads.size() > 0) {
          String tmp = String.valueOf(threads.get(0));
          threads.remove(0);
          return tmp;
         } else
          return null;
        }
       } catch (Exception e) {
        return null;
       }
      }
        /**
         * Producer side of the crawl: starts ten Processer worker threads, then walks the
         * result pages, looks up the elements with class "ProductTitle" on each page and
         * appends an entry to the shared {@code threads} list for every non-empty href.
         * Finally sets {@code isFinished} so the workers can stop once the list is drained.
         *
         * NOTE(review): this is an excerpt; the lines consisting only of dots stand for
         * code that was elided, so {@code pages}, {@code url}, {@code source} and
         * {@code bookurl} are declared somewhere that is not visible here.
         */
        public void process(){
            // pre-processing
           // start 10 worker threads below; they wait for URLs to process
         new Thread(new Processer(this)).start();
         new Thread(new Processer(this)).start();
         new Thread(new Processer(this)).start();
         new Thread(new Processer(this)).start();
         new Thread(new Processer(this)).start();
         new Thread(new Processer(this)).start();
         new Thread(new Processer(this)).start();
         new Thread(new Processer(this)).start();
         new Thread(new Processer(this)).start();
         new Thread(new Processer(this)).start();

          
         ....   
         for(int j=0;j<pages;j++)// page through from the first page to the last
         {
        ...
        source = CommonUtil.getSourceByUrl(url);
        List<Element> elements = source.getAllElementsByClass("ProductTitle");  
        for (Element element : elements){ 
         // href of the first start tag inside the title element
         String href = element.getContent().getFirstStartTag().getAttributeValue("href");  
          
         if (href!=null && !"".equals(href)){  
           synchronized (threads) {
            // NOTE(review): bookurl is not defined in the visible code (only href is);
            // presumably it is derived from href in an elided line - confirm.
            threads.add(bookurl);// store the URL
           }    
         } 

         }
         // NOTE(review): by brace count in this excerpt this assignment sits inside the page
         // loop and the closing braces of the method and class are missing; presumably it is
         // meant to run once after the last page - confirm against the full source.
         isFinished=true; // the main thread has finished collecting all URLs
    }

    class Processer implements Runnable
    {
        BookCatcher c;
        public Processer(BookCatcher c)
        {
            this.c = c;
        }
        public void run()
        {
            String bookUrl = null;
            while((bookUrl=c.getUrl())!=null || !BookCatcher.isFinished)  //当还有记录时就处理       
            {
                if(bookUrl!=null)
                {
              //处理分析页面数据并将数据保存到数据库
                  
           Source source = CommonUtil.getSourceByUrl(bookUrl);
           String tmp = "";
       
           BookBean bean = new BookBean();
           bean.setStoreBookUrl(bookUrl);
           
           //书名
           StartTag tag = source.getFirstStartTagByClass("BookTitle");
           tmp = tag.getRenderer().toString();
           bean.setName(tmp);
      
           //作者
           tag = source.getFirstStartTagByClass("bookAuthor");
           if (tag!=null){
            List<StartTag> list = tag.getElement().getAllStartTags(HTMLElementName.A);
            if (list.size()>0)
             bean.setAuthor(list.get(0).getElement().getContent().toString());
           }
           
           //书籍图片
           tag = source.getFirstStartTag("id", "BookImage", false);
           if (tag!=null)
            bean.setPicUrl(tag.getAttributeValue("src").trim());
           
           StartTag tagLeft = source.getFirstStartTagByClass("Left");
           tmp=tagLeft.getRenderer().toString();
           List<String> resList = new ArrayList<String>();
           String[] leftArray = tmp.split("·");
           for (String str:leftArray){      
            if ("".equals(str)) continue;
            resList.add(str);      
           }
           StartTag tagRight = source.getFirstStartTagByClass("Right");
           tmp = tagRight.getRenderer().toString();     
           String[] rightArray = tmp.split("·");
           for (String str:rightArray){
            if ("".equals(str)) continue;
            resList.add(str);      
           }     
           for (String str:resList){
            try{
             String name = CommonUtil.getString(str.split(":")[0]);
             String value = CommonUtil.getString(str.split(":")[1]);
              if ("ISBN".equals(name)) bean.setIsbn(value);   
             if ("出版社".equals(name)) bean.setPublisherOrg(value);
             if ("页码".equals(name)) bean.setPageNum(value);
             if ("出版日期".equals(name)) bean.setPublishDate(value);      
                      
             if ("装帧".equals(name)) bean.setWrapType(value);
             if ("开本".equals(name)) bean.setFormat(value);
            }catch(ArrayIndexOutOfBoundsException ee){}
           }
           
                
           //定价
           tag = source.getFirstStartTagByClass("BookPrice");
           String price = tag.getElement().getAllStartTags(HTMLElementName.STRIKE).get(0).getRenderer().toString();
           price = price.substring(1,price.length());
           bean.setPrice(price);
           
           //零售价格
           tag = source.getFirstStartTagByClass("DetailPrice");
           if (tag!=null)
            bean.setStorePrice(tag.getElement().getAllStartTagsByClass("OurPrice").get(0).getRenderer().toString());
           else
            bean.setStorePrice("0");
                   
           List<StartTag> tagList = source.getAllStartTagsByClass("ContentValue");
           if(tagList!=null && tagList.size()>1){
            // 内容简介
            tag = tagList.get(0);
            tmp = tag.getRenderer().toString().trim();
            if(tmp.length()>2000)
             tmp = tmp.substring(0, 1990)+"...";
            bean.setContent(tmp);
      
           }
           new BookBO().saveBook(bean);
              
              
                }else//如果没标志处理则休眠一秒再重新开始处理
                {
                    try
                    {
                        Thread.sleep(1000);
                    } catch (InterruptedException e)
                    {
                        e.printStackTrace();
                    }
                }
            }
           
        }
    }


    //CommonUtil中的方法,通过httpclient提交到url,返回的页面信息装入jericho的source
     public static Source getSourceByUrl(final String url) {
      Source source = null;
      HttpClient httpClient = new HttpClient();
      GetMethod getMethod = new GetMethod(url);
      getMethod.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

      getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
        new DefaultHttpMethodRetryHandler());
      try {
       int statusCode = httpClient.executeMethod(getMethod);
       if (statusCode != HttpStatus.SC_OK) {
        log.error("Method failed: " + getMethod.getStatusLine());
       }

       source = new Source(getMethod.getResponseBodyAsStream());
      } catch (HttpException e) {
       log.error("Please check your provided http address!");
       e.printStackTrace();
      } catch (IOException e) {
       e.printStackTrace();
      } finally {
       getMethod.releaseConnection();
      }
      return source;
     }

     关键字: httpclient, jericho, 多线程

  • 相关阅读:
    Dynamic导出解决方案修改其XML信息
    子网格
    官方文档
    ADFS登录页面自定义
    ADFS设置Tokn生命周期
    特征工程
    Pandas
    分类决策树
    Python基本知识
    机器学习的基本概念
  • 原文地址:https://www.cnblogs.com/webreport/p/1563852.html
Copyright © 2011-2022 走看看