zoukankan      html  css  js  c++  java
  • 爬虫:HttpClient Jsoup 爬虫学习

    maven工程:

     log4j.properties

    # Root logger: level DEBUG, output sent to the appender named A1.
    log4j.rootLogger=DEBUG,A1
    # Explicit DEBUG level for loggers under the org.example namespace.
    # NOTE(review): the example classes in this article live in com.mwq.crawler.test / jsoup,
    # so this entry presumably has no effect beyond what the root logger already does - confirm.
    log4j.logger.org.example=DEBUG
    # A1 writes to the console.
    log4j.appender.A1=org.apache.log4j.ConsoleAppender
    log4j.appender.A1.layout=org.apache.log4j.PatternLayout
    # Line format: timestamp [thread] [logger]-[level] message newline.
    log4j.appender.A1.layout.ConversionPattern=%-d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c]-[%p] %m%n

    pom.xml

    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Maven build descriptor for the crawler examples in this article. -->
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <!-- Project coordinates. -->
        <groupId>org.example</groupId>
        <artifactId>com.mwq.webmagic</artifactId>
        <version>1.0-SNAPSHOT</version>
        <dependencies>
            <!-- Apache HttpClient: provides the org.apache.http classes used by the HttpClient examples. -->
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>4.5.2</version>
            </dependency>
            <!-- SLF4J-to-log4j binding; pairs with the log4j.properties shown above. -->
            <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-log4j12 -->
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
                <version>1.7.25</version>
            </dependency>
            <!-- jsoup: HTML parsing, used by JsoupFirstTest. -->
            <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.10.2</version>
            </dependency>
            <!-- JUnit 4, test scope only: supplies the @Test annotation used by JsoupFirstTest. -->
            <!-- https://mvnrepository.com/artifact/junit/junit -->
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>4.12</version>
                <scope>test</scope>
            </dependency>
            <!-- Commons IO: supplies FileUtils, used by JsoupFirstTest to read a file into a String. -->
            <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>2.6</version>
            </dependency>
            <!-- Commons Lang 3. NOTE(review): not referenced by the classes shown in this article - confirm it is needed. -->
            <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-lang3</artifactId>
                <version>3.7</version>
            </dependency>
    
    
        </dependencies>
    
    
    
    </project>

    一、HttpClient模拟浏览器请求网页

    package com.mwq.crawler.test;
    
    import org.apache.http.HttpEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    
    /**
     * Minimal HttpClient example: fetches a page with a GET request and prints its body.
     */
    public class Crawler {
        /**
         * Requests http://www.itcast.cn and, when the status code is 200, prints the
         * response body decoded as UTF-8.
         *
         * @param args unused
         * @throws Exception if executing the request, reading the body, or closing a resource fails
         */
        public static void main(String[] args) throws Exception {
            // "Open the browser": create the HttpClient instance.
            CloseableHttpClient httpClient = HttpClients.createDefault();
            try {
                // "Type the address": create the GET request for the target URL.
                HttpGet httpGet = new HttpGet("http://www.itcast.cn");

                // "Press enter": execute the request and obtain the response.
                CloseableHttpResponse response = httpClient.execute(httpGet);
                try {
                    if (response.getStatusLine().getStatusCode() == 200) {
                        HttpEntity httpEntity = response.getEntity();
                        String content = EntityUtils.toString(httpEntity, "utf-8");

                        System.out.println(content);
                    }
                } finally {
                    // Release the connection held by the response even if decoding failed.
                    response.close();
                }
            } finally {
                // Always shut the client down; it was previously never closed.
                httpClient.close();
            }
        }
    }

    二、HttpGet

    package com.mwq.crawler.test;
    
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    
    /**
     * Issues a plain GET request with HttpClient and prints the length of the response body.
     */
    public class HttpGetTest {
        /**
         * Requests http://www.itcast.cn; on status 200 prints the body length. I/O failures
         * are printed as stack traces rather than propagated.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            // Create the HttpClient instance.
            CloseableHttpClient httpClient = HttpClients.createDefault();

            // Create the GET request for the target URL.
            HttpGet httpGet = new HttpGet("http://www.itcast.cn");

            // Execute the request.
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity());
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // response is still null when execute() threw; an unguarded close() would
                // raise a NullPointerException here and skip closing the client below.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

        }
    }

    三、带参数的HttpGet

    package com.mwq.crawler.test;
    
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.utils.URIBuilder;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    import java.net.URISyntaxException;
    
    /**
     * Issues a GET request whose query string is assembled with URIBuilder and prints the
     * length of the response body.
     */
    public class HttpGetParamTest {
        /**
         * Requests http://yun.itheima.com/search with the query parameter keys=Java; on
         * status 200 prints the body length. I/O failures are printed as stack traces.
         *
         * @param args unused
         * @throws URISyntaxException if the request URI cannot be built
         */
        public static void main(String[] args) throws URISyntaxException {
            // Create the HttpClient instance.
            CloseableHttpClient httpClient = HttpClients.createDefault();

            // Target address: http://yun.itheima.com/search?keys=Java
            // Start from the base URI ...
            URIBuilder uriBuilder = new URIBuilder("http://yun.itheima.com/search");
            // ... and add the query parameter.
            uriBuilder.setParameter("keys","Java");
            // Create the GET request from the built URI.
            HttpGet httpGet = new HttpGet(uriBuilder.build());
            System.out.println("发起请求的信息:" + httpGet);
            // Execute the request.
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity());
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // response is still null when execute() threw; an unguarded close() would
                // raise a NullPointerException here and skip closing the client below.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

        }
    }

    四、HttpPost

    package com.mwq.crawler.test;
    
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    
    /**
     * Issues a POST request without a body using HttpClient and prints the length of the
     * response body.
     */
    public class HttpPostTest {
        /**
         * Posts to http://www.itcast.cn; on status 200 prints the body length. I/O failures
         * are printed as stack traces rather than propagated.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            // Create the HttpClient instance.
            CloseableHttpClient httpClient = HttpClients.createDefault();

            // Create the POST request for the target URL.
            HttpPost httpPost = new HttpPost("http://www.itcast.cn");

            // Execute the request.
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpPost);
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity());
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // response is still null when execute() threw; an unguarded close() would
                // raise a NullPointerException here and skip closing the client below.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

        }
    }

    五、带参数的HttpPost

    package com.mwq.crawler.test;
    
    import org.apache.http.NameValuePair;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.client.utils.URIBuilder;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.message.BasicNameValuePair;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    import java.io.UnsupportedEncodingException;
    import java.net.URISyntaxException;
    import java.util.ArrayList;
    import java.util.List;
    
    /**
     * Issues a POST request carrying URL-encoded form data and prints the length of the
     * response body.
     */
    public class HttpPostParamTest {
        /**
         * Posts the form field keys=Java to http://yun.itheima.com/search; on status 200
         * prints the body length. I/O failures during the request are printed as stack traces.
         *
         * @param args unused
         * @throws Exception if the form entity cannot be created with the requested encoding
         */
        public static void main(String[] args) throws URISyntaxException, Exception {
            // Create the HttpClient instance.
            CloseableHttpClient httpClient = HttpClients.createDefault();

            // Create the POST request for the target URL.
            HttpPost httpPost = new HttpPost("http://yun.itheima.com/search");
            // List holding the form fields to submit.
            List<NameValuePair> params  = new ArrayList<NameValuePair>();
            // Form field keys=Java (sent in the request body, not in the URL).
            params.add(new BasicNameValuePair("keys","Java"));
            // Build the form entity: first argument is the form data, second the encoding.
            UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params,"utf8");
            httpPost.setEntity(formEntity);
            System.out.println("发起请求的信息:" + httpPost);
            // Execute the request.
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpPost);
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity());
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // response is still null when execute() threw; an unguarded close() would
                // raise a NullPointerException here and skip closing the client below.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

        }
    }

    六、HttpConfig

    package com.mwq.crawler.test;
    
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    
    /**
     * Issues a GET request with explicit timeout settings and prints the length of the
     * response body.
     */
    public class HttpConfigTest {
        /**
         * Requests http://www.itcast.cn using a per-request RequestConfig; on status 200
         * prints the body length. I/O failures are printed as stack traces.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            // Create the HttpClient instance.
            CloseableHttpClient httpClient = HttpClients.createDefault();

            // Create the GET request for the target URL.
            HttpGet httpGet = new HttpGet("http://www.itcast.cn");
            // Per-request settings (all values in milliseconds).
            RequestConfig config= RequestConfig.custom().setConnectTimeout(1000) // max time to establish the connection
                  .setConnectionRequestTimeout(500) // max time to obtain a connection from the connection manager
                  .setSocketTimeout(10*1000) // max time for data transfer
                  .build();
            // Attach the settings to the request.
            httpGet.setConfig(config);
            // Execute the request.
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response.getStatusLine().getStatusCode() == 200) {
                    String content = EntityUtils.toString(response.getEntity());
                    System.out.println(content.length());
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // response is still null when execute() threw (e.g. on a timeout); an unguarded
                // close() would raise a NullPointerException and skip closing the client below.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                try {
                    httpClient.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }

        }
    }

    七、HttpClientPool

    package com.mwq.crawler.test;
    
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.impl.client.CloseableHttpClient;
    import org.apache.http.impl.client.HttpClients;
    import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
    import org.apache.http.util.EntityUtils;
    
    import java.io.IOException;
    
    /**
     * Demonstrates sending several requests through one shared pooling connection manager.
     */
    public class HttpClientPoolTest {
        /**
         * Configures the connection pool and performs two GET requests through it.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            // Connection pool shared by every request below.
            PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

            // Upper bound on connections in the whole pool.
            cm.setMaxTotal(100);
            // Upper bound on connections per host.
            cm.setDefaultMaxPerRoute(10);
            // Send the same request twice through the pool.
            for (int round = 0; round < 2; round++) {
                doGet(cm);
            }
        }

        /**
         * Performs one GET against http://www.itcast.cn with a client backed by the given
         * connection manager and, on status 200, prints the body length.
         *
         * @param cm connection manager the client is built on
         */
        private static void doGet(PoolingHttpClientConnectionManager cm) {
            // The client is built on top of the shared pool instead of owning its connections.
            CloseableHttpClient pooledClient = HttpClients.custom().setConnectionManager(cm).build();
            HttpGet request = new HttpGet("http://www.itcast.cn");
            CloseableHttpResponse reply = null;
            try {
                reply = pooledClient.execute(request);
                int status = reply.getStatusLine().getStatusCode();
                if (status != 200) {
                    return;
                }
                String body = EntityUtils.toString(reply.getEntity(), "utf8");
                System.out.println(body.length());
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Only the response is closed; the client is left open because its
                // connections are managed by the pool.
                closeResponse(reply);
            }
        }

        /** Closes the response if one was obtained, printing any I/O failure. */
        private static void closeResponse(CloseableHttpResponse reply) {
            if (reply == null) {
                return;
            }
            try {
                reply.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    }

    八、Jsoup

    package jsoup;
    /**
     * Jsoup解析文件,一般不用其爬取,因为其对一些技术的支持不如httpclient好
     * 最终都是转为dom对象进行解析
     * 解析为元素后可以获取元素中的数据
     * selector查找元素单一方式,组合方式
     */
    
    import org.apache.commons.io.FileUtils;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Attributes;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.junit.Test;
    
    import java.io.File;
    import java.io.IOException;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.Set;
    
    /**
     * JUnit examples showing how to parse HTML with jsoup (from a URL, a String or a file)
     * and how to look elements up through the DOM-style API and through CSS selectors.
     *
     * <p>NOTE(review): every {@code new File("")} below is an empty placeholder path as
     * published; point it at a real HTML file before running the file-based tests.
     */
    public class JsoupFirstTest {

        /** Fetches and parses a URL, then prints the text of the first title element. */
        @Test
        public void testUrl() throws Exception {
            // First argument is the URL to fetch, second the timeout in milliseconds.
            Document doc = Jsoup.parse(new URL("http://www.itcast.cn"), 1000);
            // Tag lookup: take the content of the title tag.
            String title = doc.getElementsByTag("title").first().text();
            System.out.println(title);
        }

        /** Reads a file into a String, parses that String and prints the title text. */
        @Test
        public void testString() throws Exception {
            // Read the whole file into a String with the Commons IO helper.
            String content = FileUtils.readFileToString(new File(""), "utf8");
            // Parse the String.
            Document doc = Jsoup.parse(content);
            String title = doc.getElementsByTag("title").first().text();
            System.out.println(title);
        }

        /** Parses a file directly and prints the title text. */
        @Test
        public void testFile() throws Exception {
            Document doc = Jsoup.parse(new File(""), "utf8");
            String title = doc.getElementsByTag("title").first().text();
            System.out.println(title);
        }

        /** Looks elements up by id, tag, class and attribute through the DOM-style API. */
        @Test
        public void testDom() throws Exception {
            // Parse the file into a Document.
            Document doc = Jsoup.parse(new File(""), "utf8");
            // By id.
            Element element = doc.getElementById("city_di");
            // By tag name.
            Element element1 = doc.getElementsByTag("span").first();
            // By class.
            Elements el = doc.getElementsByClass("class_a class_b");
            // By attribute name, and by attribute name + value.
            Elements abc = doc.getElementsByAttribute("abc");
            Element href = doc.getElementsByAttributeValue("href", "http://sh.itcase.cn").first();
            // Print what was found.
            System.out.println(element.text());
            System.out.println(element1);
            System.out.println(el);
            System.out.println(href);
        }

        /** Reads id, class names, attributes and text from a single element. */
        @Test
        public void testData() throws Exception {
            // Parse the file into a Document.
            Document doc = Jsoup.parse(new File(""), "utf8");
            // Pick the element by id.
            Element element = doc.getElementById("test");
            String str = "";
            // The element's id.
            str = element.id();
            // The element's class attribute as one string, then as a set of names.
            str = element.className();
            Set<String> classSet = element.classNames();
            for (String s : classSet) {
                System.out.println(s);
            }
            // A single attribute value.
            str = element.attr("id");
            // All attributes of the element.
            Attributes attributes = element.attributes();
            System.out.println(attributes.toString());
            // The element's text content.
            str = element.text();
            System.out.println(str);
        }

        /** Looks elements up with simple CSS selectors. */
        @Test
        public void testSelecter() throws Exception {
            // Parse the HTML file into a Document.
            Document doc = Jsoup.parse(new File(""), "utf8");
            // By tag / by id (kept as reference examples):
    //            Elements span = doc.select("span");
    //            for (Element element : span) {
    //                System.out.println(element.text());
    //            }
    //            Element element = doc.select("#city_bj").first();
    //            System.out.println(element.text());
            // By class.
            Element e = doc.select(".class_a").first();
            // By attribute name.
            Element element = doc.select("[abc]").first();
            // By attribute value. "[class=s_name]" matches on the value of the class
            // attribute; "[class-s_name]" would instead test for an attribute that is
            // literally named "class-s_name".
            Elements elements = doc.select("[class=s_name]");
            for (Element element1 : elements) {
                System.out.println(element1.text());
            }
        }

        /** Looks elements up with combined CSS selectors. */
        @Test
        public void testSelector2() throws Exception {
            // Parse the HTML file into a Document.
            Document doc = Jsoup.parse(new File(""), "utf8");
            // tag + id
            Element select = doc.select("h3#city_bj").first();
            System.out.println(select.text());
            // tag + class
            select = doc.select("li.class_a").first();
            // tag + attribute name
            select = doc.select("span[abc]").first();
            // any combination
            select = doc.select("span[abc].s_name").first();
            // "ancestor child": descendants of an element, e.g. ".city_con li"
            Elements select1 = doc.select(".city_con li");
            for (Element element : select1) {
                System.out.println(element.text());
            }
            // "parent > child": direct children only. ".city_con > ul > li" takes the ul
            // elements directly under .city_con, then the li elements directly under those.
            select1 = doc.select(".city_con > ul >li");
            // "parent > *": every direct child of the parent.
            select1 = doc.select(".city_con > ul >*");
        }

    }
  • 相关阅读:
    十一、EnterpriseFrameWork框架的分层与系统业务的结合
    十八、【开源】EnterpriseFrameWork框架核心类库之Winform控制器
    进步的起点
    六、EnterpriseFrameWork框架基础功能之权限管理
    十四、EnterpriseFrameWork框架核心类库之简易ORM
    八、EnterpriseFrameWork框架基础功能之自定义报表
    五、利用EnterpriseFrameWork快速开发基于WebServices的接口
    十五、EnterpriseFrameWork框架核心类库之系统启动入口与初始化
    十二、EnterpriseFrameWork框架核心类库之与EntLib结合
    十七、EnterpriseFrameWork框架核心类库之Web控制器
  • 原文地址:https://www.cnblogs.com/mwq1992/p/14218971.html
Copyright © 2011-2022 走看看