zoukankan      html  css  js  c++  java
  • 【爬虫入门】HttpClient+Jsoup进行简单的网页访问和信息保存

    【项目选型】

    (Maven)SpringBoot+JPA

    【项目搭建】

    pom.xml:

    <!-- Inherit plugin and dependency management from the Spring Boot parent POM. -->
    <parent>
            <artifactId>spring-boot-starter-parent</artifactId>
            <groupId>org.springframework.boot</groupId>
            <version>2.5.0</version>
        </parent>
    
        <dependencies>
    
            <!-- Spring Boot web starter (Spring MVC) -->
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-web</artifactId>
            </dependency>
    
            <!-- Spring Data JPA starter -->
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-data-jpa</artifactId>
            </dependency>
    
            <!-- MySQL JDBC driver -->
            <dependency>
                <groupId>mysql</groupId>
                <artifactId>mysql-connector-java</artifactId>
            </dependency>
    
            <!-- Apache HttpClient, used to fetch pages and download images.
                 No <version> is given here, so it is expected to come from the parent's
                 dependency management. -->
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
            </dependency>
    
            <!-- jsoup HTML parser; the version is pinned explicitly in this POM. -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.14.2</version>
            </dependency>
    
            <!-- Apache Commons Lang 3 (version expected from the parent's dependency management). -->
            <dependency>
                <groupId>org.apache.commons</groupId>
                <artifactId>commons-lang3</artifactId>
            </dependency>
    
        </dependencies>
    View Code

    AppMain.java:

    /**
     * Application entry point.
     *
     * <p>{@code @EnableScheduling} switches on scheduled-task support, which the
     * crawler's {@code @Scheduled} job relies on.
     */
    @SpringBootApplication
    @EnableScheduling
    public class AppMain {

        /**
         * Boots the Spring application context.
         *
         * @param args command-line arguments, handed to Spring Boot unchanged
         */
        public static void main(String[] args) {
            SpringApplication.run(AppMain.class, args);
        }
    }
    View Code

    【分析】

    【具体实现】

    POJO+JPA+SQL建表

    POJO类:
    /**
     * JPA entity mapped to the {@code md_item} table; one instance per crawled project.
     *
     * <p>Apart from the primary key, every value is stored as the raw string that was
     * scraped from the page.
     */
    @Table(name = "md_item")
    @Entity
    public class Product {
        /**
         * Database-generated primary key.
         *
         * <p>Declared as {@code Long} so that it agrees with the {@code bigint AUTO_INCREMENT}
         * column and with the ID type parameter of {@code JpaRepository<Product,Long>}.
         * The accessors elided below must use {@code Long} as well.
         */
        @Id
        @GeneratedValue(strategy = GenerationType.IDENTITY)
        Long id;
        /** Project id taken from the listing page; used to detect already-saved rows. */
        String proid;
        /** Author text scraped from the listing entry. */
        String proauthor;
        String protitle;
        String probackcount;
        String probackermoney;
        String promoneypercent;
        String starttime;
        String endtime;
        String protype;
        String prostatus;
        /** File name returned by the image download step. */
        String proimgpath;
        // get/set/toString
    }
    
    JPA:
    /**
     * Spring Data repository for {@link Product}.
     *
     * <p>The two type arguments of {@code JpaRepository} are the entity class and the
     * type of its primary key.
     */
    public interface ProductDao extends JpaRepository<Product,Long> {
    }
    
    SQL:
    -- Recreates the md_item table that backs the Product entity.
    -- WARNING: the DROP below removes any existing md_item table together with its rows.
    DROP TABLE IF EXISTS `md_item`;
    -- Every scraped value is stored as text; only `id` is numeric.
    CREATE TABLE `md_item` (
      `id` bigint(20) NOT NULL AUTO_INCREMENT,
      `proid` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      `proauthor` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      `protitle` varchar(64) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      `probackcount` varchar(20) DEFAULT NULL,
      `probackermoney` varchar(32) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      `promoneypercent` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      `starttime` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      `endtime` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      `protype` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      `prostatus` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      `proimgpath` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL,
      PRIMARY KEY (`id`)
    -- AUTO_INCREMENT=82 sets the next generated id; presumably left over from a dump of an existing table.
    ) ENGINE=InnoDB AUTO_INCREMENT=82 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
    准备工作

    封装HttpClient 实现获取网页代码和图片下载

    /**
     * HttpClient wrapper registered as a Spring bean.
     * Offers page fetching ({@code doGetHtml}) and image download ({@code doGetImage}).
     */
    @Component
    public class HttpUtils {
        // Connection pool handed to every HttpClient this class builds.
        private PoolingHttpClientConnectionManager clientConnectionManager;
    
        /**
         * Creates the connection pool and applies its size limits.
         */
        public HttpUtils() {
            PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
            // Upper bound on the total number of pooled connections.
            pool.setMaxTotal(100);
            // Default upper bound on connections per route (target host).
            pool.setDefaultMaxPerRoute(10);
            this.clientConnectionManager = pool;
        }
    
        /**
         * Fetches a page with an HTTP GET request and returns its body as text.
         *
         * @param url address of the page to fetch
         * @return the body decoded as UTF-8 when the status code is 200 and the response
         *         carries an entity; the literal {@code "ERROR"} when the status code is 200
         *         but there is no entity; an empty string for any other status code or when
         *         an {@link IOException} occurs
         */
        public String doGetHtml(String url){
            // A client is built per call on top of the shared connection pool.
            CloseableHttpClient client = HttpClients.custom()
                    .setConnectionManager(this.clientConnectionManager)
                    .build();
            HttpGet request = new HttpGet(url);
            request.setConfig(this.getConfig());

            CloseableHttpResponse response = null;
            try {
                response = client.execute(request);
                if (response.getStatusLine().getStatusCode() != 200) {
                    return "";
                }
                // EntityUtils can only be used when the response actually has an entity.
                return response.getEntity() == null
                        ? "ERROR"
                        : EntityUtils.toString(response.getEntity(), "utf-8");
            } catch (IOException e) {
                e.printStackTrace();
                return "";
            } finally {
                // Always release the response, whatever path was taken above.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    
        /**
         * Builds the per-request timeout settings.
         *
         * @return a request configuration carrying the connect, connection-request and
         *         socket timeouts (HttpClient interprets these values as milliseconds)
         */
        private RequestConfig getConfig() {
            return RequestConfig.custom()
                    // Longest time allowed for establishing a connection.
                    .setConnectTimeout(1000)
                    // Longest time allowed for obtaining a connection from the pool.
                    .setConnectionRequestTimeout(500)
                    // Longest time allowed for data transfer.
                    .setSocketTimeout(10000)
                    .build();
        }
    
        /**
         * Downloads an image with an HTTP GET request and stores it under a random file name.
         *
         * <p>The file is written into the hard-coded download directory and is named
         * {@code <random UUID><extension>}, where the extension is taken from the URL.
         *
         * @param url address of the image
         * @return the generated file name when the status code is 200 and the image was
         *         written; the literal {@code "ERROR"} when the status code is 200 but the
         *         response has no entity; an empty string for any other status code or when
         *         an {@link IOException} occurs (including a failure to write or close the file)
         */
        public String doGetImage(String url){
            // A client is built per call on top of the shared connection pool.
            CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(this.clientConnectionManager).build();
            HttpGet httpGet = new HttpGet(url);
            httpGet.setConfig(this.getConfig());
            CloseableHttpResponse httpResponse = null;
            try {
                httpResponse = httpClient.execute(httpGet);
                if (httpResponse.getStatusLine().getStatusCode() != 200) {
                    return "";
                }
                if (httpResponse.getEntity() == null) {
                    return "ERROR";
                }
                // Rename the image: random UUID plus the extension found in the URL.
                String picName = UUID.randomUUID().toString() + extensionOf(url);
                File target = new File("G:/IJDailyCode/Crawler/src/main/resources/downloadImg/" + picName);
                // try-with-resources guarantees the file handle is flushed and closed,
                // even when writing the entity fails part-way through.
                try (OutputStream outputStream = new FileOutputStream(target)) {
                    httpResponse.getEntity().writeTo(outputStream);
                }
                return picName;
            } catch (IOException e) {
                e.printStackTrace();
                return "";
            } finally {
                // Always release the response, whatever path was taken above.
                if (httpResponse != null) {
                    try {
                        httpResponse.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }

        /**
         * Extracts the file extension (including the leading dot) from a URL.
         *
         * <p>Any query string or fragment is ignored, and only a dot that appears after the
         * last {@code '/'} counts, so path separators can never leak into the file name.
         *
         * @param url the URL to inspect
         * @return the extension such as {@code ".jpg"}, or an empty string when there is none
         */
        private static String extensionOf(String url) {
            int end = url.length();
            int query = url.indexOf('?');
            if (query >= 0) {
                end = query;
            }
            int fragment = url.indexOf('#');
            if (fragment >= 0 && fragment < end) {
                end = fragment;
            }
            String path = url.substring(0, end);
            int dot = path.lastIndexOf('.');
            return dot > path.lastIndexOf('/') ? path.substring(dot) : "";
        }

    任务方法:

    /**
     * Scheduled crawler job: fetches the listing pages, extracts one {@code Product} per
     * list item, skips items that are already stored, and saves the rest together with
     * the downloaded cover image.
     *
     * NOTE(review): this listing is abridged. {@code infoDoc} and {@code proStatus} are
     * used below but never declared in the visible code, so the class does not compile as shown.
     */
    @Component
    public class GetTask {
        @Autowired
        HttpUtils httpUtils;
        @Autowired
        ProductService productService;
    
        /**
         * Entry point of the job; fetches listing pages 1 through 8 and parses each one.
         * Scheduled with a fixedDelay of 100*1000 (milliseconds, per Spring's @Scheduled).
         */
        @Scheduled(fixedDelay = 100*1000)
        public void crawlerMain(){
            String mainUrl="https://zhongchou.modian.com/all/top_time/going/";
            /* Loop over the page numbers */
            for(int page=1;page<=8;++page){
                String tempUrl=mainUrl+page;
                String html = httpUtils.doGetHtml(tempUrl);
                /* Parse the page and extract the project data */
                this.parse(html);
            }
        }
    
        /**
         * Parses one listing page and stores every project that is not saved yet.
         *
         * @param html page source as returned by {@code HttpUtils.doGetHtml}
         */
        private void parse(String html) {
            Document doc = Jsoup.parse(html);
            Elements proElms = doc.select("div.pro_field > ul > li");
            for (Element proElm:proElms) {
                /**
                 * Assorted selector lookups for the individual fields
                 */
                String proId = proElm.attr("data-pro-id");
                String proAuthor=proElm.select("div.author > a > p").text();
                // ... further selector lookups of the same kind are omitted in this listing ...
                // NOTE(review): infoDoc is not declared in the visible code; presumably the
                // parsed detail page of the project. Confirm against the full source.
                String imgUrl=infoDoc.getElementById("big_logo").attr("src");
    
                // The author's intent: null properties are ignored by the example query,
                // so only proid is set on the probe object.
                Product proExample=new Product();
                proExample.setProid(proId);
                
                // Look up existing rows and skip this item if one is found
                List<Product> examples = productService.findAll(proExample);
                System.out.println("list有无数据:"+examples.size());
                if(examples.size()>0){
                    System.out.println("===数据已存在===");
                    continue;
                }
                
                /* Bind the scraped values; the probe object is reused as the entity to save */
                proExample.setProauthor(proAuthor);
                // ... further setter calls of the same kind are omitted in this listing ...
                // NOTE(review): proStatus is not declared in the visible code.
                proExample.setProstatus(proStatus);
                /* Download the image and keep the returned file name */
                String proImgPath=httpUtils.doGetImage(imgUrl);
                proExample.setProimgpath(proImgPath);
                /* Save */
                productService.save(proExample);
            }
        }
    }    

    最没用的Service

    /**
     * Transactional {@code ProductService} implementation that forwards every call to
     * {@code ProductDao}.
     */
    @Service
    @Transactional
    public class ProductServiceImpl implements ProductService {

        @Autowired
        private ProductDao productDao;

        /**
         * Saves the given product through the repository.
         *
         * @param product entity to save
         */
        @Override
        public void save(Product product) {
            productDao.save(product);
        }

        /**
         * Runs a query-by-example lookup that uses the given object as the probe.
         *
         * @param product probe object
         * @return the list returned by the repository for that example
         */
        @Override
        public List<Product> findAll(Product product) {
            return productDao.findAll(Example.of(product));
        }
    }

    【保存结果展示】

    【遇到问题】

      网页get下来本身的数据就是NULL,无法判断,导致保存失败。懒得加判断了。

      【重要】在判断数据是否重复时,使用 JpaRepository 的 findAll(Example) 方式,提供一个 POJO 模板进行自动查询,返回的 List.size() 始终为 0,导致重复数据仍会被保存。

      【重要】@Table(name=" 表名")报红

  • 相关阅读:
    使用RecyclerView打造Gallery
    Retrofit简介与使用方法(翻译)
    迷宫实现递归版本C++
    牛客笔试题
    牛客笔试题---求最长重复词长度之和
    C++句柄解析
    C++双向循环链表实现
    String C++完整实现。
    String写时拷贝实现
    顺序表操作补充(查找方法增加)
  • 原文地址:https://www.cnblogs.com/YFEYI/p/15232902.html
Copyright © 2011-2022 走看看