zoukankan      html  css  js  c++  java
  • Java爬页面数据

    
    
     <!-- Web-scraping dependencies: start.
          Note: the sample code in this article calls only jsoup directly;
          httpclient and htmlunit are not referenced by it. -->
            <dependency>
                <groupId>org.apache.httpcomponents</groupId>
                <artifactId>httpclient</artifactId>
                <version>4.5.9</version>
            </dependency>
            <dependency>
                <groupId>net.sourceforge.htmlunit</groupId>
                <artifactId>htmlunit</artifactId>
                <version>2.27</version>
            </dependency>
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.11.3</version>
            </dependency>
    <!-- Web-scraping dependencies: end -->

    一.创建你要爬取的字段实体
    package com.tecnon.common.utils; import lombok.Data; @Data public class POItoExcel { /** * 书名 */ private String bookName; /** * 价格 */ private String price; /** * 作者 */ private String author; /** * 出版社 */ private String Press; /** * 出版时间 */ private String pressTime; } 二.单元测试实现代码 这是我要爬取的页面链接:https://www.bookuu.com/search.php?cid=101702 实现单元测试 public static void main(String[] args) { List<POItoExcel> poItoExcelList = new ArrayList<>(); for (int i = 1; i <= 2; i++) { String url = "https://www.bookuu.com/search.php?cid=101702&page=" + i; try { Document document = Jsoup.connect(url).header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36") .header("referer", "https://www.bookuu.com/search.php?cid=101702&page=" + i).get(); Element body = document.body(); Elements a = body.getElementsByClass("wd-640"); for (Iterator it = a.iterator(); it.hasNext(); ) { POItoExcel poItoExcel = new POItoExcel(); Element e = (Element) it.next(); Elements bn = e.getElementsByClass("fs-16"); Elements p = e.getElementsByClass("fs-21"); Elements w = e.getElementsByClass("wd-30p fl to-hd mr-10"); Elements f = e.getElementsByClass("wd-30p fl to-hd cl-9 mr-10"); Elements t = e.getElementsByClass("wd-30p fl to-hd cl-9"); //爬到的数据放到list中 poItoExcel.setBookName(bn.text()); poItoExcel.setPrice(p.text()); poItoExcel.setAuthor(w.text()); poItoExcel.setPress(f.text()); poItoExcel.setPressTime(t.text()); poItoExcelList.add(poItoExcel); } } catch (Exception e) { e.printStackTrace(); } System.out.println("第" + i+ "页结束"); } System.out.println("----"+ StringUtil.getJsonFromObject(poItoExcelList) +"----"); } 有什么问题:加qq:501397578
  • 相关阅读:
    NOIP普及组2003经验总结
    Day6上午 DP练习题
    Day4 图论
    Day3 数据结构
    使用ettercap进行dns欺骗和获取目标浏览的图片
    flask入门
    攻防世界-web-unserialize3
    数据结构课设作业-----飞机订票系统
    bugku NaNNaNNaNNaN-Batman
    it's a test
  • 原文地址:https://www.cnblogs.com/thcy1314/p/13565166.html
Copyright © 2011-2022 走看看