zoukankan      html  css  js  c++  java
  • 3月12日第四周上课,疫情可视化加爬虫学习

    源码,爬虫部分

    package utils;
    
    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.net.URISyntaxException;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.NameValuePair;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.client.utils.URIBuilder;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.apache.http.message.BasicNameValuePair;
    import org.apache.http.util.EntityUtils;
    
    import com.alibaba.fastjson.JSONObject;
    
    public class HttpClientPool {
    /**
     * 这是httpClient连接池
     * @throws Exception 
     */
        public static void HttpClientPool() {
            //创建连接池管理器
            PoolingHttpClientConnectionManager cm =new  PoolingHttpClientConnectionManager();
            
            
            //设置最大连接数
            cm.setMaxTotal(100);
            //设置每个主机的最大连接数
            cm.setDefaultMaxPerRoute(10);
            //使用连接池管理器发起请求
    //        doGet(cm);
    //        doPost(cm);
        }
    
    public static String doPost(PoolingHttpClientConnectionManager cm) throws Exception {
        //从连接池中获取httpClient对象
            CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
    
              //2、输入网址,发起请求,创建httpPost对象
              HttpPost httpPost= new HttpPost("http://m.sinovision.net/newpneumonia.php");
              System.out.println("发起请求的信息:"+httpPost);
              
              //Post使用,声明List集合,封装表单中的参数
              List<NameValuePair> params= new ArrayList<NameValuePair>();
              params.add(new BasicNameValuePair("",""));
              
              //创建表单的Entity对象,第一个参数是封装好的参数,第二个是编码
              UrlEncodedFormEntity formEntity= new UrlEncodedFormEntity(params,"utf8");
              
              //设置表单的Entity对象到Post请求中
              httpPost.setEntity(formEntity);
              
              
            //配置请求信息
              RequestConfig config = RequestConfig.custom().setConnectTimeout(1000)//设置创建连接的最长时间,单位为毫秒
              .setConnectionRequestTimeout(500)//设置获取连接的最长时间,单位为毫秒
              .setSocketTimeout(10*1000)//设置传输数据的最长时间,单位为毫秒
              .build();
              //给请求设置请求信息
              httpPost.setConfig(config);
              
              CloseableHttpResponse response=null;
              String content=null;
              try {
              //3、按回车,发起请求,返回响应,使用httpClient对象发起请求
               response = httpClient.execute(httpPost);
              //解析响应,获取数据
              //判断状态码是否为两百
              if(response.getStatusLine().getStatusCode()==200) {
                  HttpEntity httpEntity = response.getEntity();
                  if(httpEntity!=null) {
                      content = EntityUtils.toString(httpEntity, "utf8");
                      System.out.println(content.length());
    //                  System.out.println(content);
                      }
              }else {
                  System.out.println("请求失败"+response);
              }
              }catch(Exception e) {
                  e.printStackTrace();
              }finally {
        
                  try {
                    //关闭response
                      if(response!=null) {
                          //关闭response 
                          response.close();
                      }
                    //不关闭httpClient
                    //httpClient.close();
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
              }
              return content;              
    }
    
    public static String doGet(PoolingHttpClientConnectionManager cm) throws Exception {
        //从连接池中获取httpClient对象
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        //创建URIBuilder
          URIBuilder uribuilder= new URIBuilder("http://m.sinovision.net/newpneumonia.php");
          //设置参数:参数名+参数值,可设置多个
          uribuilder.setParameter("","");
          
          //2、输入网址,发起请求,创建httpGet对象
          HttpGet httpGet= new HttpGet(uribuilder.build());
          System.out.println("发起请求的信息:"+httpGet);
          
          //配置请求信息
          RequestConfig config = RequestConfig.custom().setConnectTimeout(1000000000*1000000000)//设置创建连接的最长时间,单位为毫秒
          .setConnectionRequestTimeout(1000000000*1000000000)//设置获取连接的最长时间,单位为毫秒
          .setSocketTimeout(1000000000*1000000000)//设置传输数据的最长时间,单位为毫秒
          .build();
          //给请求设置请求信息
          httpGet.setConfig(config);
          
          
          
          
          
          CloseableHttpResponse response=null;
          String content=null;
          try {
          //3、按回车,发起请求,返回响应,使用httpClient对象发起请求
           response = httpClient.execute(httpGet);
          //解析响应,获取数据
          //判断状态码是否为两百
          if(response.getStatusLine().getStatusCode()==200) {
              HttpEntity httpEntity = response.getEntity();
              if(httpEntity!=null) {
              content = EntityUtils.toString(httpEntity, "utf8");
    //          System.out.println(content.length());
    //          System.out.println(content);
              }
          }
          }catch(Exception e) {
              e.printStackTrace();
          }finally {
    
              try {
                  if(response!=null) {
                      //关闭response 
                      response.close();
                  }
                //不能关闭httpClient
                //httpClient.close();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
          }
          return content; 
    }
    }
    package utils;
    import java.io.IOException;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.Set;
    
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Attributes;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import dao.dao;
    import entity.Info;
    public class Jsouputil {
    
        /**
         * 使用Selector选择器获取元素
         */
        public static void testSelector()throws Exception{
            //获取Document对象
            HttpClientPool httpClient =new HttpClientPool();
            //创建连接池管理器
            PoolingHttpClientConnectionManager cm =new  PoolingHttpClientConnectionManager();
            //获取网页HTML字符串
            String content=httpClient.doGet(cm);
                            
            //解析字符串
            Document doc = Jsoup.parse(content);
    //        System.out.println(doc.toString());
        
            //[attr=value],利用属性获取
            Elements elements = doc.select("div[class=todaydata] ").select("div[class=prod]");
            System.out.println(elements.toString());
            Info info=new Info();
            dao dao=new dao();
            for(Element ele:elements) {
                String province=ele.select("span[class=area]").text();
                String Confirmed_num=ele.select("span[class=confirm]").text();
                String Dead_num=ele.select("span[class=dead]").text();
                String Cured_num=ele.select("span[class=cured]").text();
                info=new Info(province,Confirmed_num,Dead_num,Cured_num);
                dao.add(info);
            }
            
        }
    }

    主要参考了https://home.cnblogs.com/u/yeyueweiliang

    然后就是其中遇到的麻烦:

    在找网站爬墙的时候先是找的这个网站

    https://wp.m.163.com/163/page/news/virus_report/index.html?_nw_=1&_anw_=1

    结果某易发现他有反爬虫的功能,就又换了一个网址

    http://m.sinovision.net/newpneumonia.php

    具体位置如下:

    中间又出现导不到数据库 的情况,然后发现是他跟我原来的数据库结构不一样,导致最后先是不出来图像

    然后又 出现tomcat崩溃等一系列情况,都被重启服务,或者换端口,或者重启电脑解决了

    本次学习爬虫,在学习上面花费三个小时,实践花费两小时,解决问题花费两小时

    具体源码可以见我github库

    https://github.com/diaolingjun/-2

  • 相关阅读:
    大一训练赛20181105-二分三分分治部分
    该说命运弄人,毫不留情。
    矩阵快速幂模板
    Final Destination II -- 矩阵快速幂模板题
    UVA -580 组合数学
    NYOJ-16-矩形嵌套 记忆化搜索
    封装标签省,市,县。三级联动
    java压缩图片设置宽高
    sql分页
    常用的正则表达式@java后台
  • 原文地址:https://www.cnblogs.com/520520520zl/p/12467486.html
Copyright © 2011-2022 走看看