zoukankan      html  css  js  c++  java
  • JAVA爬虫——爬取采集北京市政百姓信件内容——首都之窗(采用htmlunit,webmagic)附源代码、htmlUnit webmagic JAR包

      由于首都之窗网站第一页和第二页的网址不变,与林子雨老师教程中的做法已相差甚远,所以这里选择用htmlunit模拟点击。(使用跳转按钮时,显示的网页仍是第一页),因此本代码始终通过点击“下一页”按钮来翻页。

    爬取网址:http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow

    获取代码:

      1 package util;
      2 
      3 import java.io.IOException;
      4 import java.util.ArrayList;
      5 import java.util.LinkedList;
      6 import java.util.List;
      7 
      8 import com.gargoylesoftware.htmlunit.BrowserVersion;
      9 import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
     10 import com.gargoylesoftware.htmlunit.ImmediateRefreshHandler;
     11 import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
     12 import com.gargoylesoftware.htmlunit.WebClient;
     13 import com.gargoylesoftware.htmlunit.html.HtmlElement;
     14 import com.gargoylesoftware.htmlunit.html.HtmlPage;
     15 
     16 public class 首都之窗 {
     17     static List<String> lines_zi=new LinkedList<String>();
     18     static List<String> lines_jian=new LinkedList<String>();
     19     static List<String> lines_tou=new LinkedList<String>();
     20     
     21     static String line;
     22     public static void Value_start()
     23     {
     24         // TODO 自动生成的方法存根
     25 WebClient webClient=new WebClient(BrowserVersion.CHROME); // 实例化Web客户端 
     26         
     27         System.out.println("AAAAAA");
     28         try {
     29             webClient.getOptions().setActiveXNative(false);
     30             //webClient.getOptions().setCssEnabled(false);
     31             //webClient.getOptions().setRedirectEnabled(true);
     32             webClient.getOptions().setJavaScriptEnabled(true);
     33             webClient.getOptions().setDoNotTrackEnabled(true);
     34             webClient.getOptions().setThrowExceptionOnScriptError(false);
     35             webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
     36             webClient.getCache().setMaxSize(100);
     37             webClient.getOptions().setJavaScriptEnabled(true);//运行js脚本执行
     38             webClient.setAjaxController(new NicelyResynchronizingAjaxController());//设置支持AJAX
     39             webClient.getOptions().setCssEnabled(false);//忽略css
     40             webClient.getOptions().setUseInsecureSSL(true);//ssl安全访问
     41             webClient.getOptions().setThrowExceptionOnScriptError(false);  //解析js出错时不抛异常
     42             //webClient.getOptions().setTimeout(50000);  //超时时间  ms
     43             webClient.getCookieManager().setCookiesEnabled(true);
     44             webClient.getCache().clear();
     45             webClient.setRefreshHandler(new ImmediateRefreshHandler());
     46             webClient.getOptions().setTimeout(2*1000);    //网页多少ms超时响应
     47             webClient.setJavaScriptTimeout(600*1000);   //javaScript多少ms超时
     48             webClient.setAjaxController(new NicelyResynchronizingAjaxController());  
     49             //webClient.setJavaScriptTimeout(600*1000);   
     50             //webClient.getOptions().setRedirectEnabled(true); 
     51             webClient.waitForBackgroundJavaScript(60*1000);
     52             
     53             HtmlPage page=webClient.getPage("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"); // 解析获取页面
     54             HtmlElement a=page.getElementByName("nextPage");
     55             int j=1,lastj=0;
     56             FileHandle fh=new FileHandle();
     57             StringHandle sh=new StringHandle();
     58             List<String> lastInfo_zi=new ArrayList<String>();
     59             List<String> lastInfo_jian=new ArrayList<String>();
     60             List<String> lastInfo_tou=new ArrayList<String>();
     61             System.out.println("asdfsdaf");
     62             fh.outFile(""+"
    ", "E:\578095023\FileRecv\寒假作业\大三寒假作业\北京市政百姓信件分析实战\list.txt", false);
     63            
     64             while(j!=600)
     65             {
     66                 
     67                 String nowInfo=page.asXml();
     68 
     69                 List<String> infoList_zi=sh.getExpString("letterdetail\('.*?','.*?'\)", nowInfo);
     70                 int g_size_zi=infoList_zi.size();
     71                 if(sh.StringListSameOutStringList(infoList_zi, lastInfo_zi).size()!=g_size_zi&&g_size_zi==7)
     72                 {
     73                     //System.out.println(g_size);
     74                     for(int i=0;i<g_size_zi;i++)
     75                     {
     76                         String theWeb=infoList_zi.get(i).replaceAll("letterdetail\('.*?','", "").replace("')", "");
     77                         System.out.println(theWeb);
     78                         lines_zi.add(theWeb);
     79                         fh.outFile(theWeb+"
    ", "E:\578095023\FileRecv\寒假作业\大三寒假作业\北京市政百姓信件分析实战\list.txt", true);
     80                         
     81                         if(i==g_size_zi-1)
     82                         {
     83                             lastInfo_zi=infoList_zi;
     84                             System.out.println(j);
     85                             j++;
     86                             break;
     87                         }
     88                              
     89                     }
     90                     page=a.click();
     91                 }
     92                 //page=a.click();
     93             }
     94             
     95             
     96         }catch (FailingHttpStatusCodeException | IOException e) {
     97             // TODO Auto-generated catch block
     98             e.printStackTrace();
     99         } finally{
    100             webClient.close(); // 关闭客户端,释放内存
    101         }
    102     
    103     }
    104     public static void main(String[] args) {
    105         Value_start();
    106     }
    107 
    108 }
    getPass

    爬取详细数据:

      1 package util;
      2 import java.io.*;
      3 import java.util.List;
      4 
      5 import org.jsoup.Connection;
      6 import org.jsoup.Jsoup;
      7 import org.jsoup.nodes.Document;
      8 import org.jsoup.nodes.Element;
      9 import org.jsoup.select.Elements;
     10 
     11 import java.io.IOException;
     12 import java.util.ArrayList;
     13 import java.util.List;
     14 
     15 import org.jsoup.Connection;
     16 import org.jsoup.Jsoup;
     17 import org.jsoup.nodes.Document;
     18 import org.jsoup.nodes.Element;
     19 import org.jsoup.select.Elements;
     20 
     21 import util.SslUtils;
     22 
     23 import us.codecraft.webmagic.Page;
     24 import us.codecraft.webmagic.Site;
     25 import us.codecraft.webmagic.Spider;
     26 import us.codecraft.webmagic.processor.PageProcessor;
     27 
     28 import com.bean.InfoBean;
     29 import com.dao.InfoDao;
     30 public class pa2 implements PageProcessor {
     31     static int num=0;
     32     static String Id;
     33     static String Question;
     34     static String Question_user;
     35     static String Question_date;
     36     static String Question_info;
     37     static String Answer;
     38     static String Answer_user;
     39     static String Answer_date;
     40     static String Answer_info;
     41     static String Url;
     42     //static String regEx="[
    `~!@#$%^&()+=|{}':;',\[\].<>/?~!@#¥%……&*()——+|{}【】‘;:”“’。, 、?? ]";
     43     static String aa = "";//这里是将特殊字符换为aa字符串," "代表直接去掉
     44 
     45     // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
     46     private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
     47     private static int count =0;
     48 
     49     @Override
     50     public Site getSite() {
     51         return site;
     52     }
     53     //主页面
     54     public void parent(Page page)
     55     {
     56 
     57         System.out.println("抓取的内容
    "+
     58                 page.getHtml().xpath("//span[@name='cutStr' and @dispLength='68']//text()").get()
     59         );
     60     }
     61     //子页面
     62     public void child(Page page) throws IOException {
     63 
     64         System.out.println("RRRRRRRRR");
     65         BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(new File("E:\578095023\FileRecv\寒假作业\大三寒假作业\list.txt")),
     66                 "UTF-8"));
     67         String line=null;
     68         System.out.println("SSSSSSSS");
     69         while((line=br.readLine())!=null)
     70         {
     71 
     72             String url= "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.";
     73             String type="";//声明类型码
     74             type="consultDetail.flow?originalId=";
     75             url+=type;
     76             url+=line;
     77             System.out.println(url);
     78             page.addTargetRequest(url);
     79 
     80             url= "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.";
     81             type="";//声明类型码
     82             type="suggesDetail.flow?originalId=";
     83             url+=type;
     84             url+=line;
     85             System.out.println(url);
     86             page.addTargetRequest(url);
     87 
     88             url= "http://www.beijing.gov.cn/hudong/hdjl/com.web.consult.";
     89             type="";//声明类型码
     90             type="complainDetail.flow?originalId=";
     91             url+=type;
     92             url+=line;
     93             System.out.println(url);
     94             page.addTargetRequest(url);
     95         }
     96 
     97         if(page.getUrl().regex("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow").match())
     98         {
     99 
    100             parent(page);
    101         }
    102         else
    103         {
    104             Question=page.getHtml().xpath("//div[contains(@class, 'col-xs-10')]/strong//text()").get().trim();
    105             // Question=Question.replaceAll(regEx, aa);
    106 
    107             Question_user=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'my-3')]/div[contains(@class, 'col-xs-10') and contains(@class, 'text-muted')]//text()").get().trim();
    108             //Question_user=Question_user.replaceAll(regEx, aa);
    109             Question_user=Question_user.replaceAll("来信人", aa).trim();
    110             Question_user=Question_user.replaceAll(":", aa).trim();
    111             Question_date=page.getHtml().xpath("//div[contains(@class, 'col-xs-12')]/div[contains(@class, 'col-xs-5')]//text()").get();
    112             // Question=Question.replaceAll(regEx, aa);
    113             Question_date=Question_date.replaceAll("时间", aa).trim();
    114             Question_date=Question_date.replaceAll(":", aa).trim();
    115 
    116             Question_info=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'mx-2') ]//text()").get();
    117             //Question_info=Question_info.replaceAll(regEx, aa);
    118 
    119             Answer=page.getHtml().xpath("//div[contains(@class, 'col-xs-9') and contains(@class, 'my-2')]//text()").get();
    120             //Answer=Answer.replaceAll(regEx, aa);
    121 
    122             Answer_user=page.getHtml().xpath("//div[contains(@class, 'col-xs-9') and contains(@class, 'my-2')]//text()").get();
    123             // Answer_user=Answer_user.replaceAll(regEx, aa);
    124 
    125             Answer_date=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'col-sm-3')and contains(@class, 'col-md-3') and contains(@class, 'my-2')]//text()").get();
    126             // Answer_date=Answer_date.replaceAll(regEx, aa);
    127             Answer_date=Answer_date.replaceAll("答复时间", aa).trim();
    128             Answer_date=Answer_date.replaceAll(":", aa).trim();
    129 
    130 
    131             List<String> values=new ArrayList<String>();
    132             values=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'my-3')and contains(@class, 'p-4')]//*//text()").all();
    133             Answer_info=null;
    134             for(String value:values)
    135             {
    136                 Answer_info+=value;
    137             }
    138             if(Answer_info==null)
    139             {
    140                 Answer_info=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'my-3')and contains(@class, 'p-4')]//text()").get();
    141             }
    142             Answer_info=Answer_info.replaceAll("?", aa).trim();
    143             Answer_info=Answer_info.replaceAll("null", aa).trim();
    144 
    145             Url=page.getUrl().get();
    146             System.out.println("抓取的内容
    "+
    147                     page.getHtml().xpath("//div[contains(@class, 'col-xs-10')]/strong//text()").get()
    148             );
    149 
    150             System.out.println("Id:" + Id+
    151                     "
     Question:" + Question+
    152                     "\n Question_user:" + Question_user+
    153                     "
     Question_date:" + Question_date+
    154                     "
     Question_info:" + Question_info+
    155                     "
     Answer:" + Answer+
    156                     "
     Answer_user:" + Answer_user+
    157                     "
     Answer_date:" + Answer_date+
    158                     "
     Answer_info:"+Answer_info+
    159                     "
     Url:"+Url);
    160             InfoDao.add(Question, Question_user, Question_date, Question_info, Answer, Answer_user, Answer_date, Answer_info, Url);
    161         }
    162         count ++;
    163     }
    164     @Override
    165     public void process(Page page) {
    166         num=num+1;
    167         if(num==1)
    168         {
    169             try {
    170                 child(page);
    171             } catch (IOException e) {
    172                 e.printStackTrace();
    173             }
    174         }
    175         else
    176         {
    177             Question=page.getHtml().xpath("//div[contains(@class, 'col-xs-10')]/strong//text()").get().trim();
    178             // Question=Question.replaceAll(regEx, aa);
    179 
    180             Question_user=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'my-3')]/div[contains(@class, 'col-xs-10') and contains(@class, 'text-muted')]//text()").get().trim();
    181             //Question_user=Question_user.replaceAll(regEx, aa);
    182             Question_user=Question_user.replaceAll("来信人", aa).trim();
    183             Question_user=Question_user.replaceAll(":", aa).trim();
    184             Question_date=page.getHtml().xpath("//div[contains(@class, 'col-xs-12')]/div[contains(@class, 'col-xs-5')]//text()").get();
    185             // Question=Question.replaceAll(regEx, aa);
    186             Question_date=Question_date.replaceAll("时间", aa).trim();
    187             Question_date=Question_date.replaceAll(":", aa).trim();
    188 
    189             Question_info=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'mx-2') ]//text()").get();
    190             //Question_info=Question_info.replaceAll(regEx, aa);
    191 
    192             Answer=page.getHtml().xpath("//div[contains(@class, 'col-xs-9') and contains(@class, 'my-2')]//text()").get();
    193             //Answer=Answer.replaceAll(regEx, aa);
    194 
    195             Answer_user=page.getHtml().xpath("//div[contains(@class, 'col-xs-9') and contains(@class, 'my-2')]//text()").get();
    196             // Answer_user=Answer_user.replaceAll(regEx, aa);
    197 
    198             Answer_date=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'col-sm-3')and contains(@class, 'col-md-3') and contains(@class, 'my-2')]//text()").get();
    199             // Answer_date=Answer_date.replaceAll(regEx, aa);
    200             Answer_date=Answer_date.replaceAll("答复时间", aa).trim();
    201             Answer_date=Answer_date.replaceAll(":", aa).trim();
    202 
    203             List<String> values=new ArrayList<String>();
    204             values=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'my-3')and contains(@class, 'p-4')]//*//text()").all();
    205             Answer_info=null;
    206             for(String value:values)
    207             {
    208                 Answer_info+=value;
    209             }
    210             if(Answer_info==null)
    211             {
    212                 Answer_info=page.getHtml().xpath("//div[contains(@class, 'col-xs-12') and contains(@class, 'my-3')and contains(@class, 'p-4')]//text()").get();
    213             }
    214             Answer_info=Answer_info.replaceAll("?", aa).trim();
    215             Answer_info=Answer_info.replaceAll("null", aa).trim();
    216 
    217             Url=page.getUrl().get();
    218             System.out.println("抓取的内容
    "+
    219                     page.getHtml().xpath("//div[contains(@class, 'col-xs-10')]/strong//text()").get()
    220             );
    221 
    222             System.out.println("Id:" + Id+
    223                     "
     Question:" + Question+
    224                     "\n Question_user:" + Question_user+
    225                     "
     Question_date:" + Question_date+
    226                     "
     Question_info:" + Question_info+
    227                     "
     Answer:" + Answer+
    228                     "
     Answer_user:" + Answer_user+
    229                     "
     Answer_date:" + Answer_date+
    230                     "
     Answer_info:"+Answer_info+
    231                     "
     Url:"+Url);
    232             InfoDao.add(Question, Question_user, Question_date, Question_info, Answer, Answer_user, Answer_date, Answer_info, Url);
    233         }
    234 
    235     }
    236 
    237     public static void main(String[] args) {
    238         try {
    239             SslUtils.ignoreSsl();
    240         } catch (Exception e) {
    241             e.printStackTrace();
    242         }
    243         // jsoup("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow");
    244         long startTime, endTime;
    245         System.out.println("开始爬取...");
    246         InfoDao.delete();
    247         startTime = System.currentTimeMillis();
    248         Spider.create(new pa2()).addUrl("http://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow").thread(5).run();
    249         endTime = System.currentTimeMillis();
    250         System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了"+count+"条记录");
    251     }
    252 
    253 
    254 }
    getInfo

    下载地址:

    https://github.com/Smartisa/JAVABeijing

  • 相关阅读:
    TCP
    关系型数据库基础
    spark教程(16)-Streaming 之 DStream 详解
    spark教程(15)-Streaming
    灰度图Matlab
    mesh函数
    axis函数
    Matlab提供了两种除法运算:左除(\)和右除(/)
    基和时间平移矩阵
    转载:实现MATLAB2016a和M文件关联
  • 原文地址:https://www.cnblogs.com/smartisn/p/12237534.html
Copyright © 2011-2022 走看看