zoukankan      html  css  js  c++  java
  • 使用Jsoup抓取网页数据

    Jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。

    基本了解参考中文文档:http://www.open-open.com/jsoup/

    下面介绍一个具体例子:

    比如我们要抓取 http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp 上某个时段内所有城市的天气信息,并保存为CSV格式的文件。

    在Chrome下先执行一次访问,查看页面访问参数:


    其中得到请求方式、URL、参数列表等信息。

    然后根据文档开始编写代码。

    项目结构如下:

    CityWeather.java:

     1 package bean;
     2 
     /**
      * Plain data holder for one record of the scraped report table.
      *
      * <p>All six values are kept as the raw text taken from the page; no
      * parsing or validation is performed here. {@code Crawler} fills one
      * instance from every {@link #ITEM_NUMBER} consecutive table cells, in the
      * order id, city, date, aqi, aql, pp.
      *
      * <p>NOTE(review): {@code aqi}, {@code aql} and {@code pp} presumably stand
      * for air quality index, air quality level and primary pollutant - confirm
      * against the source page.
      */
     public class CityWeather
     {
         /** Number of table cells that make up one record. */
         public final static int ITEM_NUMBER = 6;

         private String id;
         private String city;
         private String date;
         private String aqi;
         private String aql;
         private String pp;

         /** Creates an empty record; every field is {@code null} until set. */
         public CityWeather(){}

         /**
          * Creates a fully populated record.
          *
          * @param id   first cell of the row
          * @param city second cell of the row
          * @param date third cell of the row
          * @param aqi  fourth cell of the row
          * @param aql  fifth cell of the row
          * @param pp   sixth cell of the row
          */
         public CityWeather(String id, String city, String date,
                 String aqi, String aql, String pp)
         {
             this.id = id;
             this.city = city;
             this.date = date;
             this.aqi = aqi;
             this.aql = aql;
             this.pp = pp;
         }

         public String getId()
         {
             return id;
         }

         public void setId(String id)
         {
             this.id = id;
         }

         public String getCity()
         {
             return city;
         }

         public void setCity(String city)
         {
             this.city = city;
         }

         public String getDate()
         {
             return date;
         }

         public void setDate(String date)
         {
             this.date = date;
         }

         public String getAqi()
         {
             return aqi;
         }

         public void setAqi(String aqi)
         {
             this.aqi = aqi;
         }

         public String getAql()
         {
             return aql;
         }

         public void setAql(String aql)
         {
             this.aql = aql;
         }

         public String getPp()
         {
             return pp;
         }

         public void setPp(String pp)
         {
             this.pp = pp;
         }

         /**
          * Renders the record as one output line: the six fields joined by
          * {@code ", "} and terminated by a line feed. Unset fields appear as
          * the text {@code null}.
          *
          * @return the record as a single newline-terminated line
          */
         @Override
         public String toString()
         {
             // The terminator must be the escape sequence "\n"; a raw line break
             // inside a string literal does not compile.
             return id + ", " + city + ", " + date + ", " + aqi +
                     ", " + aql + ", " + pp + "\n";
         }
     }
    View Code

    War.java:

     1 package bean;
     2 
     /**
      * Describes one page request for the crawler: where to go, which request
      * parameters to send, which HTTP method to use and which elements to pick
      * out of the returned document.
      *
      * <p>The parameter and value arrays are stored and returned by reference;
      * no defensive copies are made.
      */
     public class War
     {
         /** Request-method constant: issue an HTTP GET. */
         public final static int GET = 0;
         /** Request-method constant: issue an HTTP POST. */
         public final static int POST = 1;

         // URL of the page to visit
         private String url;
         // Names of the request parameters
         private String[] params;
         // Values matching params, index by index
         private String[] values;
         // Request method, GET or POST
         private int requestMethod;
         // Selector expression for the elements to extract
         private String selector;

         /**
          * Creates a request description.
          *
          * @param url           URL of the page to visit
          * @param params        request parameter names, may be {@code null}
          * @param values        request parameter values, paired with
          *                      {@code params} by index
          * @param requestMethod {@link #GET} or {@link #POST}
          * @param selector      selector expression for the elements to extract
          */
         public War(String url, String[] params, String[] values,
                 int requestMethod, String selector)
         {
             this.url = url;
             this.params = params;
             this.values = values;
             this.requestMethod = requestMethod;
             // Assign the field directly: calling the overridable setter from a
             // constructor would run subclass code on a half-built object.
             this.selector = selector;
         }

         public String getUrl()
         {
             return url;
         }

         public void setUrl(String url)
         {
             this.url = url;
         }

         public String[] getParams()
         {
             return params;
         }

         public void setParams(String[] params)
         {
             this.params = params;
         }

         public String[] getValues()
         {
             return values;
         }

         public void setValues(String[] values)
         {
             this.values = values;
         }

         public int getRequestMethod()
         {
             return requestMethod;
         }

         public void setRequestMethod(int requestMethod)
         {
             this.requestMethod = requestMethod;
         }

         public String getSelector()
         {
             return selector;
         }

         public void setSelector(String selector)
         {
             this.selector = selector;
         }
     }
    View Code

    Op.java:

     1 package io;
     2 import java.io.BufferedWriter;
     3 import java.io.FileWriter;
     4 import java.io.IOException;
     5 import java.io.Writer;
     6 import java.util.ArrayList;
     7 import bean.CityWeather;
     8 
     9 public class Op 
    10 {
    11     public static void List2File(ArrayList<CityWeather> list, String file) throws IOException
    12     {
    13           Writer w = new FileWriter(file, true);
    14         BufferedWriter buffWriter = new BufferedWriter(w);
    15         for(CityWeather cw : list)
    16         {
    17             buffWriter.write(cw.toString());
    18         }
    19         buffWriter.close();
    20         w.close();
    21     }
    22 }
    View Code

    Crawler.java:

     1 package service;
     2 import bean.CityWeather;
     3 import bean.War;
     4 import org.jsoup.Jsoup;
     5 import org.jsoup.nodes.Document;
     6 import org.jsoup.nodes.Element;
     7 import org.jsoup.select.Elements;
     8 import org.jsoup.Connection;
     9 import java.io.IOException;
    10 import java.util.ArrayList;
    11 
    12 public class Crawler 
    13 {
    14     public static ArrayList<CityWeather> getResults(War war) throws IOException  
    15     {
    16         Connection conn = Jsoup.connect(war.getUrl());
    17         if(war.getParams() != null)
    18         {
    19             for(int i = 0; i < war.getParams().length; i++)
    20             {
    21                 conn.data(war.getParams()[i], war.getValues()[i]);
    22             }
    23         }
    24         Document doc = null;
    25         if(war.getRequestMethod() == War.GET)
    26         {
    27             doc = conn.timeout(10000).get();
    28         }
    29         else
    30         {
    31             doc = conn.timeout(10000).post();
    32         }
    33         if(doc == null)
    34         {
    35             System.out.print("unavailable
    ");
    36             return null;
    37         }
    38         Elements results = doc.select(war.getSelector());
    39         ArrayList<CityWeather> list = new ArrayList<CityWeather>();
    40         int i = 0;
    41         CityWeather cw = null;
    42         for(Element e: results)
    43         {
    44             switch(i % CityWeather.ITEM_NUMBER)
    45             {
    46             case 0:
    47                 cw = new CityWeather();
    48                 cw.setId(e.text());
    49                 break;
    50             case 1:
    51                 cw.setCity(e.text());
    52                 break;
    53             case 2:
    54                 cw.setDate(e.text());
    55                 break;
    56             case 3:
    57                 cw.setAqi(e.text());
    58                 break;
    59             case 4:
    60                 cw.setAql(e.text());
    61                 break;
    62             default:
    63                 cw.setPp(e.text().replace(',', '|'));
    64                 list.add(cw);
    65             }
    66             i++;
    67         }
    68         return list;
    69     }
    70 }
    View Code

    Test.java:

     1 package test;
     2 import java.text.DateFormat;
     3 import java.util.ArrayList;
     4 import java.util.Date;
     5 import service.Crawler;
     6 import bean.CityWeather;
     7 import bean.War;
     8 import io.Op;
     9 
    10 public class Test 
    11 {
    12     public static void main(String[] args) throws Exception
    13     {
    14         System.out.println(DateFormat.getDateTimeInstance().format(new Date()) + " 开始运行");
    15         String url = "http://datacenter.mep.gov.cn/report/air_daily/air_dairy.jsp";
    16         String[] params = { "city", "startdate", "enddate", "page" };
    17         String[] values = { "", "2016-05-01", "2016-05-01", "2" };
    18         int requestMethod = War.GET;
    19         String selector = "td.report1_6";
    20         War war = new War(url, params, values, requestMethod, selector);
    21         ArrayList<CityWeather> list = Crawler.getResults(war);
    22         Op.List2File(list, "data/file.csv");
    23         System.out.println(DateFormat.getDateTimeInstance().format(new Date()) + " 运行结束");
    24     }
    25 }
    View Code

    经过测试:抓取14年到16年的天气信息,跑得非常慢...大概跑了一天?...

    而且访问有时候会失败,需要循环调用 ArrayList<CityWeather> list = Crawler.getResults(war); 直到list的size不为0(表示访问成功),才可以访问下一页。

    没有异常处理...

    我在考虑是不是可以用多个线程同时抓取,每个线程各访问一页,最后再把结果merge起来,这样肯定快很多吧。

    有空实现一下。

    今天试了一下,开了1000个线程网站就暂时性bug了...

    看来还要适当控制一下...果然多线程、分布式之类的还需要花时间仔细研究一下。

    代码下载:https://github.com/SplayHuo/JavaProject

  • 相关阅读:
    AOP Aspect 统一日志、异常处理、数据格式
    java基本成员默认值
    Jackson ObjectMapper
    logstash 安装 jdbc-output出错
    ElasticSearch定时删除数据(非时间结尾规律索引)
    docker安装部署
    K8s 使用helm 安装 EFK和ELK分布式日志分析系统系列(es版本:6.7.0;)
    JWT 验证
    JS查找数组中元素index
    oracle not in 失效
  • 原文地址:https://www.cnblogs.com/huoxiayu/p/5459134.html
Copyright © 2011-2022 走看看