  • A simple crawler application with HttpClient & Jsoup

      Many others have already written detailed introductions; see this article: https://blog.csdn.net/zhuwukai/article/details/78644484

      Below is a code example:

    package com.http.client;
    
    import java.io.IOException;
    
    import org.apache.http.HttpHost;
    import org.apache.http.HttpResponse;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.conn.params.ConnRouteParams;
    import org.apache.http.impl.client.DefaultHttpClient;
    import org.apache.http.params.CoreConnectionPNames;
    import org.apache.http.util.EntityUtils;
    import org.apache.log4j.Logger;
    /**
     * 
     * @author oo
     * @date 2018-04-04
     */
    public class MyHttpClient {
        private static Logger logger = Logger.getLogger(MyHttpClient.class);
    
        /**
         * Goal: use HttpClient to crawl data from a website
         * 
         * @param args
         */
        public static void main(String[] args) {
            // Create the HttpClient object
            HttpClient hclient = new DefaultHttpClient();
    
            // Set the connect timeout, the socket (read) timeout, and a proxy server (a proxy helps keep the crawler's IP from being banned)
            hclient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000)
                    .setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000)
                    .setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("111.155.116.237", 8123));
    
            HttpGet hGet = new HttpGet("http://www.itcast.cn/");
            String content = "";
            try {
                // Send the request to the site and fetch the page source
                HttpResponse execute = hclient.execute(hGet);
                // The EntityUtils helper converts the response entity to a string
                content = EntityUtils.toString(execute.getEntity(), "utf-8");
            } catch (ClientProtocolException e) {
                e.printStackTrace();
                logger.error("********ClientProtocolException" + e);
            } catch (IOException e) {
                e.printStackTrace();
                logger.error("********IOException" + e);
            }
    
            System.out.println(content);
        }
    
    }
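
      Note that DefaultHttpClient and the params API used above (CoreConnectionPNames, ConnRouteParams) were deprecated in HttpClient 4.3. Below is a minimal sketch of the equivalent setup, with the same timeouts and proxy carried over from the example above, on the newer RequestConfig/HttpClients builder API; the class name MyHttpClient43 is just for illustration:

    package com.http.client;
    
    import java.io.IOException;
    
    import org.apache.http.HttpHost;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    public class MyHttpClient43 {
    
        public static void main(String[] args) throws IOException {
            // Connect timeout, socket (read) timeout, and proxy now live in RequestConfig
            RequestConfig config = RequestConfig.custom()
                    .setConnectTimeout(20000)
                    .setSocketTimeout(20000)
                    .setProxy(new HttpHost("111.155.116.237", 8123))
                    .build();
    
            // try-with-resources closes the client and releases its connections
            try (CloseableHttpClient client = HttpClients.custom()
                    .setDefaultRequestConfig(config)
                    .build()) {
                HttpGet get = new HttpGet("http://www.itcast.cn/");
                try (CloseableHttpResponse response = client.execute(get)) {
                    System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
                }
            }
        }
    
    }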

      Making the request with Jsoup:

    package com.http.client;
    
    import java.io.IOException;
    
    import org.apache.log4j.Logger;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class MyJsoup {
        private static Logger logger = Logger.getLogger(MyJsoup.class);
    
        public static void main(String[] args) {
            try {
                // Send the request with Jsoup
                Document document = Jsoup.connect("http://www.itcast.cn").get();
    //            System.out.println(document);
                Elements elements = document.getElementsByTag("a");
                String val = elements.text();
                System.out.println(val);
                
                for (Element element : elements) {
                    System.out.println(element.text()+":"+element.attr("href"));
                }
            } catch (IOException e) {
                e.printStackTrace();
                logger.error("***********IOException: 连接失败" + e);
            }
        }
    
    }
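
      Jsoup.connect returns a Connection whose settings can be chained before the request is sent, and some sites reject Jsoup's default Java User-Agent. A small sketch with an explicit User-Agent and timeout (both values are arbitrary choices for illustration):

    package com.http.client;
    
    import java.io.IOException;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    public class MyJsoupConfigured {
    
        public static void main(String[] args) throws IOException {
            // Same request as above, with a browser-style User-Agent and a 10s timeout
            Document document = Jsoup.connect("http://www.itcast.cn")
                    .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
                    .timeout(10 * 1000) // milliseconds
                    .get();
            System.out.println(document.title());
        }
    
    }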

      Combining HttpClient with Jsoup:

    package com.http.client;
    
    import java.io.IOException;
    
    import org.apache.http.HttpResponse;
    import org.apache.http.client.ClientProtocolException;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.DefaultHttpClient;
    import org.apache.http.util.EntityUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class HttpClientAndJsoup {
    
        public static void main(String[] args) throws ClientProtocolException, IOException {
            // Create the HttpClient object
            HttpClient hClient = new DefaultHttpClient();
            // Most crawl targets are GET requests; create a GET request object
            HttpGet hget = new HttpGet("http://www.itcast.cn/");
            // Send the request to the site and fetch the page source
            HttpResponse response = hClient.execute(hget);
            // The EntityUtils helper converts the response entity to a string
            String content = EntityUtils.toString(response.getEntity(), "utf-8");
            // Jsoup parses the page source
            Document doc = Jsoup.parse(content);
            // Select page content with a CSS element selector
            Elements elements = doc.select("div.salary_con li");
            // System.out.println(elements.text());
            for (Element element : elements) {
                String text = element.text();
                System.out.println(text);
            }
    
        }
    
    }
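
      The doc.select call accepts any CSS-style selector, so a parsed Document can be queried in many ways. Below is a stand-alone sketch of a few common patterns; the HTML string is made up for illustration, and the two-argument Jsoup.parse supplies a base URI so absUrl can resolve relative links:

    package com.http.client;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class JsoupSelectorDemo {
    
        public static void main(String[] args) {
            // A tiny in-memory page so the selector patterns can run stand-alone
            String html = "<div class='salary_con'><ul>"
                    + "<li><a href='/a.html'>First</a></li>"
                    + "<li><a href='/b.html'>Second</a></li>"
                    + "</ul></div>";
    
            // The second argument is a base URI, so absUrl() can resolve relative links
            Document doc = Jsoup.parse(html, "http://www.itcast.cn/");
    
            Elements items = doc.select("div.salary_con li"); // same selector as above
            Element firstLink = doc.selectFirst("a[href]");   // first match only
            System.out.println("first link: " + firstLink.text());
    
            for (Element li : items) {
                Element a = li.selectFirst("a");
                System.out.println(a.text() + " -> " + a.absUrl("href"));
            }
        }
    
    }
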
  • Original post: https://www.cnblogs.com/lyc-smile/p/8744237.html