zoukankan      html  css  js  c++  java
  • java扒取网页,获取所需要内容列表展示

    1.扒取原网页内容:

      

    2.本地展示效果

      

    3.代码

      

     3     @ResponseBody
     4     public Map<String, Object> findGuoWuYaoWen(HttpServletRequest request, ModelMap model, String area,String city) {
     5         Map<String, Object> map = new HashMap<String, Object>();
     6         Map<String, Object> result = new HashMap<String, Object>();
     7         List<NationalNews> nationalList = new ArrayList<>();
     8         String title,newsUrl,time;
     9         
    10         Elements elements = getUrlElements();
    11         System.out.println("========================"+elements.toString());
    12         List<Element> subList = elements.subList(0, 9);
    13         for (Element element : subList) {
    14              String  title123 = element.text();
    15              title =title123.substring(0, title123.length()-10) ;
    16              newsUrl = element.select("a").attr("href");
    17              if(!newsUrl.contains("www")){
    18                  newsUrl="http://www.gov.cn"+newsUrl;     
    19              }
    20              time = title123.substring(title123.length()-10,title123.length());
    21                 
    22              NationalNews nationalNews = new NationalNews(title, newsUrl, time);
    23              
    24              System.out.println(title+"=========================="+newsUrl+"=========================="+time);
    25              
    26              nationalList.add(nationalNews);
    27         }
    28         result.put("dataList", nationalList);
    29         return result;
    30     }
     1    private Elements getUrlElements() {
     2         System.out.println("=======================================抓取国家政府网内容任务开始了=======================================");
     3         String url = "http://www.gov.cn/pushinfo/v150203/index.htm";
     4         CloseableHttpClient httpClient = HttpClients.createDefault();
     5         Elements elementsByTag = null;
     6         
     7         try {
     8             HttpGet httpGet = new HttpGet(url);
     9             CloseableHttpResponse response = httpClient.execute(httpGet);
    10             try {
    11                 HttpEntity entity = response.getEntity();
    12                 //设置编码格式否则乱码
    13                 String html = new String(EntityUtils.toString(entity).getBytes("iso8859-1"));
    14                 
    15                  Document document = Jsoup.parse(html);
    16                  elementsByTag = document.getElementsByTag("li");
    17             }finally {     
    18                  response.close();
    19             }
    20         } catch (ClientProtocolException e1) {
    21             e1.printStackTrace();
    22         } catch (IOException e1) {
    23             e1.printStackTrace();
    24         }finally {
    25             // 关闭连接,释放资源
    26             try {
    27                 httpClient.close();
    28             } catch (IOException e) {
    29                 e.printStackTrace();
    30             }
    31         }
    32         
    33         return elementsByTag;
    34     }
    作者:iris_huahua
    本文版权归作者和博客园共有,欢迎转载,但未经作者同意必须在文章页面给出原文链接,否则保留追究法律责任的权利。
  • 相关阅读:
    _bzoj1061 [Noi2008]志愿者招募【最小费用最大流】
    _bzoj2243 [SDOI2011]染色【树链剖分】
    _bzoj1013 [JSOI2008]球形空间产生器sphere【高斯消元】
    _bzoj1002 [FJOI2007]轮状病毒【瞎搞】
    leetcode 273 Integer to English Words
    leetcode 12 Integer to Roman
    leetcode 1071 Greatest Common Divisor of Strings
    lc6 ZigZag Conversion
    lc13 Roman to Integer
    leetcode 171 Excel Sheet Column Number
  • 原文地址:https://www.cnblogs.com/irishua/p/11099026.html
Copyright © 2011-2022 走看看