zoukankan      html  css  js  c++  java
  • 新型冠状病毒 疫情 数据爬取(Java 含源码)

    代码:

      1 package 疫情;
      2 
      3 
      4 import java.text.SimpleDateFormat;
      5 import java.util.ArrayList;
      6 import java.util.Date;
      7 import java.util.List;
      8 
      9 import com.dao.InfoDao;
     10 import com.dao.YiDao;
     11 import org.jsoup.Jsoup;
     12 import org.jsoup.nodes.Document;
     13 import util.StringHandle;
     14 import us.codecraft.webmagic.Page;
     15 import us.codecraft.webmagic.Site;
     16 import us.codecraft.webmagic.Spider;
     17 import us.codecraft.webmagic.processor.PageProcessor;
     18 
     19 public class Info implements PageProcessor {
     20     static String regEx="[
    `'' " , ,]";
     21     static String aa="";
     22     // 抓取网站的相关配置,包括编码、抓取间隔、重试次数等
     23     private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
     24     private static int count =0;
     25 
     26     @Override
     27     public Site getSite() {
     28         return site;
     29     }
     30     @Override
     31     public void process(Page page) {
     32         Date format = new Date();
     33         SimpleDateFormat ft = new SimpleDateFormat ("yyyy-MM-dd hh:mm:ss");
     34         String date=ft.format(format);
     35         System.out.println("当前时间为: " + ft.format(format));
     36         //System.out.println(page.getHtml());
     37         StringHandle sh=new StringHandle();
     38         String test=page.getHtml().xpath("//script [@id='getAreaStat']").get();
     39         System.out.println(test);
     40 
     41 
     42 
     43         List<String> Provinces=sh.getExpString(""provinceName":"(.*?)","provinceShortName":"(.*?)","confirmedCount":(.*?),"suspectedCount":(.*?),"curedCount":(.*?),"deadCount":(.*?),"comment":(.*?)"","locationId":(.*?),"", test);
     44 
     45         for(String Province:Provinces)
     46         {
     47             String Province_names=sh.getExpString(""provinceName":"(.*?)"", Province).get(0).replaceAll(""provinceName":", "").replaceAll(regEx, aa);
     48 
     49             String Province_num_confirmed=sh.getExpString(""confirmedCount":(.*?),", Province).get(0).replaceAll(""confirmedCount":", "").replaceAll(regEx, aa);
     50 
     51 
     52             String Province_num_cured=sh.getExpString(""curedCount":(.*?),", Province).get(0).replaceAll(""curedCount":", "").replaceAll(regEx, aa);
     53 
     54             String Province_num_dead=sh.getExpString(""deadCount":(.*?),", Province).get(0).replaceAll(""deadCount":", "").replaceAll(regEx, aa);
     55 
     56             String Province_num_locationId=sh.getExpString(""locationId":(.*?),", Province).get(0).replaceAll(""locationId":", "").replaceAll(regEx, aa);
     57 
     58 
     59             YiDao.add("info",date,Province_names,"",Province_num_confirmed,"",Province_num_cured,Province_num_dead,Province_num_locationId);
     60         }
     61 
     62 
     63 
     64 
     65         List<String> citys=sh.getExpString(""cityName":"(.*?)","confirmedCount":(.*?),"suspectedCount":(.*?),"curedCount":(.*?),"deadCount":(.*?),"locationId":(.*?)}", test);
     66         System.out.println(citys.get(5));
     67         for(String city:citys)
     68         {
     69 
     70           //  String Province_names=sh.getExpString(""provinceName":"(.*?)"", city).get(0).replaceAll(""provinceName":", "").replaceAll(regEx, aa);
     71 
     72             String City_names=sh.getExpString(""cityName":"(.*?)"", city).get(0).replaceAll(""cityName":", "").replaceAll(regEx, aa);
     73 
     74             String City_num_confirmed=sh.getExpString(""confirmedCount":(.*?),", city).get(0).replaceAll(""confirmedCount":", "").replaceAll(regEx, aa);
     75 
     76             String City_num_cured=sh.getExpString(""curedCount":(.*?),", city).get(0).replaceAll(""curedCount":", "").replaceAll(regEx, aa);
     77 
     78             String City_num_dead=sh.getExpString(""deadCount":(.*?),", city).get(0).replaceAll(""deadCount":", "").replaceAll(regEx, aa);
     79 
     80             String City_num_locationId=sh.getExpString(""locationId":(.*?)}", city).get(0).replaceAll(""locationId":", "").replaceAll(regEx, aa);
     81 
     82             System.out.println(City_names+City_num_confirmed+""+City_num_cured+City_num_dead+City_num_locationId);
     83             YiDao.add("info",date,"",City_names,City_num_confirmed,"",City_num_cured,City_num_dead,City_num_locationId);
     84         }
     85 
     86 
     87         System.out.println("AAAA");
     88         System.out.println(citys.get(0));
     89 
     90 
     91         count ++;
     92     }
     93 
     94     public static void main(String[] args) {
     95         long startTime, endTime;
     96         System.out.println("开始爬取...");
     97         InfoDao.delete("info");
     98         startTime = System.currentTimeMillis();
     99         Spider.create(new Info()).addUrl("https://ncov.dxy.cn/ncovh5/view/pneumonia_peopleapp?from=timeline&isappinstalled=0").thread(5).run();
    100         endTime = System.currentTimeMillis();
    101         System.out.println("爬取结束,耗时约" + ((endTime - startTime) / 1000) + "秒,抓取了"+count+"条记录");
    102     }
    103 
    104 
    105 }

    效果图:

  • 相关阅读:
    【原创】cs+html+js+css模式(一):初识新模式
    【原创】cs+html+js+css模式(三):RemoteCallHandler详解
    删除表数据
    【原创】cs+html+js+css模式(二):webconfig中的设置
    silverlight动画
    rdlc报表表达式应用(字符串和转换)
    Accordion控件制作下拉面板菜单(静态数据)
    silverlight三种布局
    Sys.UI.DomElement
    Accordion控件动态数据绑定案例
  • 原文地址:https://www.cnblogs.com/smartisn/p/12283472.html
Copyright © 2011-2022 走看看