zoukankan      html  css  js  c++  java
  • 第四周--爬虫的学习

    我使用的是Java代码实现简单的爬虫

    爬取的是丁香医生的疫情信息

    源代码:

    package com.fin.collect;
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.nio.charset.StandardCharsets;
    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.sql.Statement;
    import java.sql.Timestamp;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    import javax.net.ssl.HttpsURLConnection;

    import org.jsoup.Jsoup;

    import com.alibaba.fastjson.JSONArray;
    import com.fin.util.BaseConnection;

    import net.sf.json.JSON;
    import net.sf.json.JSONObject;
    
    public class CollectDataClass {
        public static void main(String[] args) throws IOException {
            getAreaStat();
        }
    
        // 根URL
        private static String httpRequset(String requesturl) throws IOException {
            StringBuffer buffer = null;
            BufferedReader bufferedReader = null;
            InputStreamReader inputStreamReader = null;
            InputStream inputStream = null;
            HttpsURLConnection httpsURLConnection = null;
            try {
                URL url = new URL(requesturl);
                httpsURLConnection = (HttpsURLConnection) url.openConnection();
                httpsURLConnection.setDoInput(true);
                httpsURLConnection.setRequestMethod("GET");
                inputStream = httpsURLConnection.getInputStream();
                inputStreamReader = new InputStreamReader(inputStream, "utf-8");
                bufferedReader = new BufferedReader(inputStreamReader);
                buffer = new StringBuffer();
                String str = null;
                while ((str = bufferedReader.readLine()) != null) {
                    buffer.append(str);
                }
            } catch (MalformedURLException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
    
            return buffer.toString();
        }
    
        /**
         * 获取全国各个省市的确诊、死亡和治愈人数
         * 
         * @return
         */
        public static String getAreaStat() {
            String url = "https://ncov.dxy.cn/ncovh5/view/pneumonia";
            String htmlResult = "";
            try {
                htmlResult = httpRequset(url);
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
            // System.out.println(htmlResult);
    
            // 正则获取数据
            // 因为html的数据格式看着就像json格式,所以我们正则获取json
            String reg = "window.getAreaStat = (.*?)\}(?=catch)";
            Pattern totalPattern = Pattern.compile(reg);
            Matcher totalMatcher = totalPattern.matcher(htmlResult);
    
            String result = "";
            if (totalMatcher.find()) {
                result = totalMatcher.group(1);
                System.out.println(result);
                // 各个省市的是一个列表List,如果想保存到数据库中,要遍历结果,下面是demo
                JSONArray array = JSONArray.parseArray(result);
            
                try {
                    Connection conn = BaseConnection.getConnection();
                    Statement stmt = conn.createStatement();
    
                    for (int i = 0; i <= 30; i++) {
    
                        com.alibaba.fastjson.JSONObject jsonObject = com.alibaba.fastjson.JSONObject
                                .parseObject(array.getString(i));
                        String provinceName = jsonObject.getString("provinceName");
                            String current = jsonObject.getString("currentConfirmedCount");
                            String confirmed = jsonObject.getString("confirmedCount");
                            String cured = jsonObject.getString("curedCount");
                            String dead = jsonObject.getString("deadCount");
                            String suspect = jsonObject.getString("suspectedCount");
                            
                            stmt.executeUpdate( "insert into province(name,confirm,suspect,heal,dead,current,time) values('" +provinceName  + "','" +confirmed + "','" +suspect + "','" + cured + "','" + dead + "','" + current +  "')");
                            stmt.executeUpdate("update province set name='" + provinceName + "',confirm='" + confirmed + "',suspect='"+ suspect + "',heal='" + cured + "',dead='" + dead + "',current='" + current + "' where name='"+provinceName+"'");
                            
                            JSONArray array2 = jsonObject.getJSONArray("cities");
                            for (int j = 0; j < array2.size(); j++) {
                                com.alibaba.fastjson.JSONObject jsonObject2 = com.alibaba.fastjson.JSONObject
                                        .parseObject(array2.getString(j));
                                String cityname = jsonObject2.getString("cityName");
                                String current2 = jsonObject2.getString("currentConfirmedCount");
                                String confirmed2 = jsonObject2.getString("confirmedCount");
                                String cured2 = jsonObject2.getString("curedCount");
                                String dead2 = jsonObject2.getString("deadCount");
                                String suspect2 = jsonObject2.getString("suspectedCount");
                                System.out.println();
                                stmt.executeUpdate("update city set name='" + cityname + "',confirm='" + confirmed2 + "',suspect='"
                                        + suspect2 + "',heal='" + cured2 + "',dead='" + dead2 + "',current='" + current2 +"',province='"+provinceName+"' where name='"+cityname+"'");
                            }
                    }
                    stmt.close();
                    conn.close();
                } catch (SQLException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
            return result;
        }
    }
    View Code

    首先,做爬虫之前我们得先打开丁香医生的疫情页面:https://ncov.dxy.cn/ncovh5/view/pneumonia

    然后打开f12查找需要爬取的数据

  • 相关阅读:
    MFC下的各种字符串类型和相互转换
    LRESULT与wParam和lParam的问题
    C#.NET 消息机制
    Windows消息机制要点
    Windows 消息机制详解
    gb2312和UTF-8的区别
    DefWndProc/WndProc/IMessageFilter的区别
    结合windows消息系统理解C#中WndProc函数和DefWndProc函数
    Mono addin 学习笔记 5 TypeExtensionPoint
    Mono addin 学习笔记 4 再论数据扩展点(Data only extension point)
  • 原文地址:https://www.cnblogs.com/ljpljm/p/12555728.html
Copyright © 2011-2022 走看看