zoukankan      html  css  js  c++  java
  • 软件工程第四周进度总结

    本周学习了 Java 爬虫的相关知识。

    代码量500 博客数1

    一、Get请求

    /**
     * Demonstrates a GET request with Apache HttpClient: builds a URI carrying one
     * query parameter, executes the request, and prints the response body when the
     * status code is 200.
     */
    public class Web {
        static final Log logger = LogFactory.getLog(Web.class);

        /**
         * Sends the GET request and prints the outcome to standard output.
         *
         * @param args unused
         * @throws Exception propagated from building the request URI; failures while
         *                   executing the request are logged instead of rethrown
         */
        public static void main(String[] args) throws Exception {
            // Build the request URI; chain further setParameter(name, value) calls
            // to add more query parameters.
            URIBuilder uribuilder = new URIBuilder("https://www.qidian.com");
            uribuilder.setParameter("key", "xuanhuan");

            HttpGet httpGet = new HttpGet(uribuilder.build());
            System.out.println("发起请求的信息:" + httpGet);

            // try-with-resources closes the response and then the client on every path,
            // so a failing execute() can never leave a null response to be closed.
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                    CloseableHttpResponse response = httpClient.execute(httpGet)) {
                // Only a 200 response is parsed; anything else is reported.
                if (response.getStatusLine().getStatusCode() == 200) {
                    HttpEntity httpEntity = response.getEntity();
                    String content = EntityUtils.toString(httpEntity, "utf8");
                    System.out.println(content.length());
                    System.out.println(content);
                } else {
                    System.out.println("请求失败" + response);
                }
            } catch (Exception e) {
                // Top-level boundary of the demo: report the failure and finish normally.
                logger.error("GET request failed: " + httpGet, e);
            }
        }
    }

    二、Post请求

    /**
     * Demonstrates a POST request with Apache HttpClient: submits a URL-encoded
     * form and prints the length of the response body when the status code is 200.
     */
    public class Web {
        static final Log logger = LogFactory.getLog(Web.class);

        /**
         * Sends the POST request and prints the outcome to standard output.
         *
         * @param args unused
         * @throws Exception propagated from building the form entity; failures while
         *                   executing the request are logged instead of rethrown
         */
        public static void main(String[] args) throws Exception {
            HttpPost httpPost = new HttpPost("https://www.baidu.com/index.php");
            System.out.println("发起请求的信息:" + httpPost);

            // Form fields for the request body; add one BasicNameValuePair(name, value)
            // per field. The list is intentionally empty in this sample.
            List<NameValuePair> params = new ArrayList<NameValuePair>();

            // First argument is the field list, second the charset used to encode it.
            UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
            httpPost.setEntity(formEntity);

            // try-with-resources closes the response and then the client on every path,
            // so a failing execute() can never leave a null response to be closed.
            try (CloseableHttpClient httpClient = HttpClients.createDefault();
                    CloseableHttpResponse response = httpClient.execute(httpPost)) {
                // Only a 200 response is parsed; anything else is reported.
                if (response.getStatusLine().getStatusCode() == 200) {
                    HttpEntity httpEntity = response.getEntity();
                    String content = EntityUtils.toString(httpEntity, "utf8");
                    System.out.println(content.length());
                } else {
                    System.out.println("请求失败" + response);
                }
            } catch (Exception e) {
                // Top-level boundary of the demo: report the failure and finish normally.
                logger.error("POST request failed: " + httpPost, e);
            }
        }
    }

    三、Jsoup解析HTML获取DOM

    /**
     * Demonstrates two ways of obtaining a jsoup {@code Document} and printing its
     * title: parsing directly from a URL, and parsing an HTML string that was
     * fetched separately.
     */
    public class Jsouputil {

        /** Timeout, in milliseconds, passed to jsoup when it fetches the URL itself. */
        private static final int FETCH_TIMEOUT_MILLIS = 1000;

        public static void main(String[] args) throws Exception {
            testUrl();
            testString();
        }

        /**
         * Lets jsoup fetch and parse a URL, then prints the page title.
         *
         * @throws Exception if fetching or parsing the page fails
         */
        public static void testUrl() throws Exception {
            // First argument is the URL to fetch, second the timeout in milliseconds.
            Document doc = Jsoup.parse(new URL("https://www.qidian.com"), FETCH_TIMEOUT_MILLIS);
            printTitle(doc);
        }

        /**
         * Fetches HTML through the project's {@code HttpClientPool}, parses the
         * resulting string, and prints the page title.
         *
         * @throws Exception if fetching the page fails
         */
        public static void testString() throws Exception {
            HttpClientPool httpClient = new HttpClientPool();
            // Connection-pool manager handed to the HTTP helper.
            PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
            // HTML of the page as a string.
            String content = httpClient.doGet(cm);

            Document doc = Jsoup.parse(content);
            printTitle(doc);
        }

        /**
         * Prints the document title; an empty line is printed when the page has no
         * title element, instead of failing on a missing element.
         */
        private static void printTitle(Document doc) {
            System.out.println(doc.title());
        }
    }

    四、DOM的方式获取元素

    /**
     * Demonstrates the DOM-style jsoup lookups: by id, by tag, by class, and by
     * attribute, printing what each lookup finds.
     *
     * @throws Exception if fetching the page fails
     */
    public static void testDom() throws Exception {
        HttpClientPool httpClient = new HttpClientPool();
        // Connection-pool manager handed to the HTTP helper.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // HTML of the page as a string.
        String content = httpClient.doGet(cm);

        Document doc = Jsoup.parse(content);

        // 1. Look up a single element by id. getElementById returns null when the
        //    id is not present, so guard before dereferencing.
        Element elementById = doc.getElementById("overseas_tit");
        if (elementById != null) {
            System.out.println(elementById.text());
        } else {
            System.out.println("未找到id为overseas_tit的元素");
        }

        // 2. Look up elements by tag name.
        Elements elementsByTag = doc.getElementsByTag("span");
        System.out.println(elementsByTag.text());

        // 3. Look up elements by class name.
        Elements elementsByClass = doc.getElementsByClass("chart_table_th");
        System.out.println(elementsByClass.text());

        // 4. Look up elements by attribute name, and by attribute name plus value.
        Elements elementsByAttribute = doc.getElementsByAttribute("src");
        Elements elementsByAttributeValue = doc.getElementsByAttributeValue("class", "chart_table_name");
        System.out.println(elementsByAttribute);
        System.out.println(elementsByAttributeValue.text());
    }

    五、获取元素中的数据

    /**
     * Demonstrates reading data out of a single jsoup element: prints the element
     * itself, its attributes, and its text content.
     *
     * @throws Exception if fetching the page fails
     */
    public static void testData() throws Exception {
        HttpClientPool httpClient = new HttpClientPool();
        // Connection-pool manager handed to the HTTP helper.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // HTML of the page as a string.
        String content = httpClient.doGet(cm);

        Document doc = Jsoup.parse(content);
        Element element = doc.getElementById("overseas_tit");
        // getElementById returns null when the id is not present; stop here rather
        // than fail on the accessor calls below.
        if (element == null) {
            System.out.println("未找到id为overseas_tit的元素");
            return;
        }
        System.out.println(element);

        // Other accessors available on the element: id(), className(), classNames()
        // and attr(name) for a single attribute value.

        // All attributes of the element.
        Attributes attributes = element.attributes();
        System.out.println(attributes.toString());

        // Text content of the element.
        String str = element.text();
        System.out.println(str);
    }

    六、Selector选择器获取元素

        /**
         * Finds elements with a jsoup CSS selector and prints their combined text.
         *
         * <p>Selector forms accepted by {@code select}:
         * <ul>
         *   <li>{@code tagName} - by tag, e.g. {@code span}</li>
         *   <li>{@code #id} - by id, e.g. {@code #overseas_tit}</li>
         *   <li>{@code .class} - by class, e.g. {@code .chart_table_name}</li>
         *   <li>{@code [attribute]} - by attribute name, e.g. {@code [class]}</li>
         *   <li>{@code [attr=value]} - by attribute value (the form used below)</li>
         *   <li>{@code el#id}, {@code el.class}, {@code el[attr]} - tag combined with
         *       id, class or attribute, e.g. {@code h3#city_bj}; these may be mixed freely</li>
         *   <li>{@code ancestor child} - descendants of an element</li>
         *   <li>{@code parent > child} - direct children of an element</li>
         *   <li>{@code parent > *} - all direct children of an element</li>
         * </ul>
         *
         * @throws Exception if fetching the page fails
         */
        public static void testSelector() throws Exception {
            final HttpClientPool client = new HttpClientPool();
            // Connection-pool manager handed to the HTTP helper.
            final PoolingHttpClientConnectionManager poolManager = new PoolingHttpClientConnectionManager();
            // HTML of the page as a string.
            final String html = client.doGet(poolManager);

            // Parse the HTML and select by attribute value in one chain.
            final Elements matches = Jsoup.parse(html).select("[class=chart_table_name]");

            System.out.println(matches.text());
        }
  • 相关阅读:
    Spark SQL学习笔记
    《空空》陈粒
    支持向量机
    p.Value越显著,X变量越重要嘛?
    回归的武林绝学
    Neural Collaborative Filtering论文笔记
    make 学习笔记
    『并发包入坑指北』之阻塞队列
    线程池中你不容错过的一些细节
    利用策略模式优化过多 if else 代码
  • 原文地址:https://www.cnblogs.com/yeyueweiliang/p/12499960.html
Copyright © 2011-2022 走看看