zoukankan      html  css  js  c++  java
  • 利用Jsoup爬取新冠疫情数据并存至数据库

      需要用到的jar包:用于爬取网页的jsoup和htmlunit-2.37.0-bin,以及用于连接数据库的mysql.jar(MySQL的JDBC驱动)

      链接:https://pan.baidu.com/s/1VlylWmlhjd8Ka8VTruQEnA     提取码:dxeq

      爬取的原网站为:https://wp.m.163.com/163/page/news/virus_report/index.html?_nw_=1&_anw_=1

    Paqu.java

    package control;
    import java.io.IOException;
    import java.text.SimpleDateFormat;
    import java.util.ArrayList;
    import java.util.Date;
    import java.util.List;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import com.gargoylesoftware.htmlunit.BrowserVersion;
    import com.gargoylesoftware.htmlunit.WebClient;
    import com.gargoylesoftware.htmlunit.html.HtmlInput;
    import com.gargoylesoftware.htmlunit.html.HtmlPage;
    import com.gargoylesoftware.htmlunit.html.HtmlSubmitInput;
    
    import Dao.AddService;
    
     
    /**
     * Scrapes the per-province figures from the NetEase virus-report page and stores
     * one database row per province through {@link AddService}.
     *
     * <p>The page fills in its data with JavaScript, so it is first loaded and rendered
     * with HtmlUnit; the rendered markup is then parsed with Jsoup.
     */
    public class Paqu {

        /** Page whose province entries are scraped. */
        private static final String REPORT_URL =
                "https://wp.m.163.com/163/page/news/virus_report/index.html?_nw_=1&_anw_=1";

        /** Timeout passed to the HtmlUnit web client options, in milliseconds. */
        private static final int HTTP_TIMEOUT_MS = 8000;

        /** How long to let the page's background JavaScript run before reading the DOM. */
        private static final long JS_WAIT_MS = 6000;

        /** Table the scraped rows are inserted into. */
        private static final String TABLE = "myinfo";

        /**
         * Loads the report page, extracts every {@code .province} entry and inserts it
         * into the database together with the time of the scrape.
         *
         * <p>Prints {@code 爬取失败} when the page cannot be fetched or does not contain
         * the expected {@code .wrap} container.
         *
         * @param args unused
         */
        public static void main(String args[]) {
            // try-with-resources: the client must be closed once the page has been read,
            // otherwise its windows and background JavaScript processing are left behind.
            try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
                // JavaScript must be on because the figures are loaded dynamically.
                webClient.getOptions().setJavaScriptEnabled(true);
                webClient.getOptions().setCssEnabled(false);
                webClient.getOptions().setActiveXNative(false);
                // Script errors and non-2xx responses on the page should not abort the scrape.
                webClient.getOptions().setThrowExceptionOnScriptError(false);
                webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
                webClient.getOptions().setTimeout(HTTP_TIMEOUT_MS);

                HtmlPage rootPage = webClient.getPage(REPORT_URL);
                // Give the page's scripts time to populate the DOM.
                webClient.waitForBackgroundJavaScript(JS_WAIT_MS);

                Document doc = Jsoup.parse(rootPage.asXml());
                Element wrap = doc.select(".wrap").first();
                if (wrap == null) {
                    // first() returns null when nothing matched (page layout changed or
                    // the scripts did not finish); report it instead of hitting an NPE.
                    System.out.println("爬取失败");
                    return;
                }

                // One timestamp for the whole run so that all rows of a scrape share the
                // same Time value even if the inserts straddle a minute boundary.
                SimpleDateFormat formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm");
                String time = formatter.format(new Date());
                AddService dao = new AddService();

                for (Element province : wrap.select(".province")) {
                    Elements spans = province.getElementsByTag("span");
                    String sheng = spans.select(".item_name").text();
                    String xinzeng = spans.select(".item_newconfirm").text();
                    String leiji = spans.select(".item_confirm").text();
                    String zhiyu = spans.select(".item_heal").text();
                    String siwang = spans.select(".item_dead").text();
                    // Store the scraped row in the database.
                    dao.add(TABLE, sheng, xinzeng, leiji, zhiyu, siwang, time);
                }
            } catch (IOException e) {
                e.printStackTrace();
                System.out.println("爬取失败");
            }
        }

    }

       

      AddService.java:

    package Dao;
    
    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.sql.Statement;
    
    import utils.DBUtils;
    
    /**
     * Data-access helper that inserts one scraped province row into the database.
     */
    public class AddService {
        /**
         * Inserts a single row into {@code table}.
         *
         * <p>All values are bound as statement parameters rather than concatenated into
         * the SQL text: they come from a scraped web page, so a value containing a quote
         * would otherwise break the statement or allow SQL injection.
         *
         * <p>Failures are printed and swallowed (best effort), matching the previous
         * behaviour; the statement and connection are always handed to
         * {@code DBUtils.close}.
         *
         * @param table   target table name; it is concatenated into the SQL because
         *                identifiers cannot be bound as parameters, so it must come from
         *                trusted code, never from scraped or user input
         * @param sheng   province name
         * @param xinzeng newly confirmed count
         * @param leiji   cumulative confirmed count
         * @param zhiyu   cumulative cured count
         * @param dead    cumulative death count
         * @param time    timestamp string stored with the row
         */
        public void add(String table,String sheng,String xinzeng,String leiji,String zhiyu,String dead,String time) {
            String sql = "insert into " + table
                    + " (Province,Newconfirmed_num ,Confirmed_num,Cured_num,Dead_num,Time)"
                    + " values(?,?,?,?,?,?)";
            // Debug output: shows the statement template; values are bound below.
            System.out.println(sql);
            Connection conn = DBUtils.getConn();
            PreparedStatement state = null;
            try {
                state = conn.prepareStatement(sql);
                state.setString(1, sheng);
                state.setString(2, xinzeng);
                state.setString(3, leiji);
                state.setString(4, zhiyu);
                state.setString(5, dead);
                state.setString(6, time);
                state.executeUpdate();
            } catch (Exception e) {
                // Broad catch kept on purpose: a failed insert (including a missing
                // connection) must not abort the caller's scraping loop.
                e.printStackTrace();
            } finally {
                DBUtils.close(state, conn);
            }
        }
    }

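    数据库建表如下(原文此处为建表截图,未能保留):表名为 myinfo,包含 Province、Newconfirmed_num、Confirmed_num、Cured_num、Dead_num、Time 六个字段,与上面 insert 语句中的列名一一对应。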

     遇到的问题

      一开始发现页面上的数据是通过JavaScript动态加载的,直接用Jsoup请求页面拿不到具体数据;最后改用HtmlUnit的WebClient加载页面并执行其中的JavaScript,等待脚本运行完成后,再把渲染后的页面交给Jsoup解析,从而获取到动态数据。

      期间还尝试过爬取其他网站上的数据,但解析得到的doc并不完整,只能输出网站的大致框架,无法获取具体的内容。

  • 相关阅读:
    【JLOI 2015】城池攻占
    【BalticOI 2004】Sequence
    罗马游戏
    《STL源码剖析》STL迭代器分类
    《Effective C++》模版与泛型编程
    《Effective C++》继承与面向对象设计
    《Effective C++》实现 章节
    [C++]const_cast,dynamic_cast,reinterpret_cast,static_cast转型
    [C++]default constructor默认构造函数
    [C++]union联合体总结
  • 原文地址:https://www.cnblogs.com/xhj1074376195/p/12469256.html
Copyright © 2011-2022 走看看