zoukankan      html  css  js  c++  java
  • Jsoup爬取京东和融e购商品列表工具类

    1.新建maven项目,添加Jsoup和Lombok的依赖(pom.xml如下)

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>

        <groupId>org.example</groupId>
        <artifactId>Jsoup-demo</artifactId>
        <version>1.0-SNAPSHOT</version>

        <properties>
            <!-- The sources use the diamond operator and method references, so pin the
                 compiler level instead of relying on the compiler plugin's default. -->
            <maven.compiler.source>1.8</maven.compiler.source>
            <maven.compiler.target>1.8</maven.compiler.target>
            <!-- The sources contain non-ASCII comments; make the build platform-independent. -->
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        </properties>

        <dependencies>
            <!-- HTML fetching and parsing. https://mvnrepository.com/artifact/org.jsoup/jsoup -->
            <dependency>
                <groupId>org.jsoup</groupId>
                <artifactId>jsoup</artifactId>
                <version>1.13.1</version>
            </dependency>
            <!-- Lombok only generates code at compile time; it is not needed at runtime. -->
            <dependency>
                <groupId>org.projectlombok</groupId>
                <artifactId>lombok</artifactId>
                <version>1.18.12</version>
                <scope>provided</scope>
            </dependency>
        </dependencies>

    </project>

    2.新建实体类,代码如下:

     1 package cn.lxcourse.jsoup.pojo;
     2 
     3 import lombok.AllArgsConstructor;
     4 import lombok.Data;
     5 import lombok.NoArgsConstructor;
     6 
     7 @Data
     8 @NoArgsConstructor
     9 @AllArgsConstructor
    10 public class Content {
    11     private String price;
    12     private String title;
    13     private String imgSrc;
    14 }

    3.编写工具类,代码如下:

     1 package cn.lxcourse.jsoup.util;
     2 
     3 import cn.lxcourse.jsoup.pojo.Content;
     4 import org.jsoup.Jsoup;
     5 import org.jsoup.nodes.Document;
     6 import org.jsoup.nodes.Element;
     7 import org.jsoup.select.Elements;
     8 
     9 import java.net.URL;
    10 import java.util.ArrayList;
    11 import java.util.List;
    12 
    13 /**
    14  * 爬虫工具
    15  */
    16 public class JsoupUtils {
    17 
    18     /**
    19      * 爬取京东商品列表
    20      * @param keywords
    21      * @return
    22      * @throws Exception
    23      */
    24     public static List<Content> getJDGoods(String keywords) throws Exception {
    25         String url = "https://search.jd.com/Search?keyword=Java" + keywords;
    26         Document document = Jsoup.parse(new URL(url), 300000);
    27         //商品列表
    28         Element j_goodsList = document.getElementById("J_goodsList");
    29         Elements glEtemElements = j_goodsList.getElementsByClass("gl-item");
    30 
    31         List<Content> list = new ArrayList<>();
    32         for (Element element : glEtemElements) {
    33 
    34             String imgSrc = element.getElementsByTag("img").eq(0).attr("source-data-lazy-img");
    35             String price = element.getElementsByClass("p-price").eq(0).text();
    36             String title = element.getElementsByClass("p-name").eq(0).text();
    37 
    38             Content content = new Content();
    39             content.setImgSrc(imgSrc);
    40             content.setPrice(price);
    41             content.setTitle(title);
    42 
    43             list.add(content);
    44         }
    45 
    46         return list;
    47     }
    48 
    49     /**
    50      * 爬取工行融e购商品列表
    51      * @param keywords
    52      * @return
    53      * @throws Exception
    54      */
    55     public static List<Content> getRongYiGouGoods(String keywords) throws Exception {
    56         //https://mall.icbc.com.cn/searchproducts/pv.jhtml?query=java
    57         String url = "https://mall.icbc.com.cn/searchproducts/pv.jhtml?query=" + keywords;
    58 
    59         Document document = Jsoup.parse(new URL(url), 30000);
    60         Element ajaxQueryContent = document.getElementById("ajaxQueryContent");
    61 
    62         Elements liElements = ajaxQueryContent.getElementsByTag("li");
    63 
    64         List<Content> list = new ArrayList<>();
    65 
    66         for (Element el : liElements) {
    67             String src = el.getElementsByTag("img").eq(0).attr("src");
    68             String price = el.getElementsByClass("p-price").eq(0).text();
    69             String title = el.getElementsByClass("p-name").eq(0).select("a").eq(0).attr("title");
    70             Content content = new Content();
    71             content.setTitle(title);
    72             content.setPrice(price);
    73             content.setImgSrc(src);
    74             list.add(content);
    75         }
    76 
    77         return list;
    78     }
    79 
    80     public static void main(String[] args) throws Exception {
    81         //getJDGoods("Java").forEach(System.out::println);
    82         getRongYiGouGoods("java").forEach(System.out::println);
    83     }
    84 }
  • 相关阅读:
    HP惠普战66电源黄灯闪烁无法充电
    C#.NET rabbit mq 持久化时报错 durable
    手动解压安装mysql8.0 on windows my.ini
    C#.NET MySql8.0 EF db first
    EF MYSQL 出现:输入字符串的格式不正确
    EF MYSQL DB FIRST 出现2次数据库名
    mysql windows 下配置可远程连接
    团队项目的Git分支管理规范
    一个简单的软件测试流程
    微服务架构下的质量迷思——混沌工程
  • 原文地址:https://www.cnblogs.com/zhaoran8775/p/12773138.html
Copyright © 2011-2022 走看看