zoukankan      html  css  js  c++  java
  • java爬虫系列第三讲-获取页面中绝对路径的各种方法

    在使用webmagic的过程中,很多时候我们需要抓取链接的绝对路径,这里总结了几种方法,完整示例代码放在最后。

    以和讯网的一个页面为例:http://industry.hexun.com/c193_59.shtml

    xpath方式获取

    // XPath: select div#cyldata, then let links() collect the links found inside it.
    log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").links().all());
    // XPath: ask directly for the abs:href attribute of every <a> under div#cyldata.
    log.info("{}", page.getHtml().xpath("//div[@id='cyldata']//a//@abs:href").all());
    

    xpath+css选择器方式获取

    // Select div#cyldata with XPath, then use a CSS selector to read abs:href from each <a> inside it.
    log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").css("a", "abs:href").all());
    

    css选择器方式获取

    // Chain two CSS selectors: the container first, then abs:href of each <a> inside it.
    log.info("{}", page.getHtml().css("div[id='cyldata']").css("a", "abs:href").all());
    // Select the container with CSS and let links() collect its links.
    log.info("{}", page.getHtml().css("div[id='cyldata']").links().all());
    // Select the <a> elements themselves and call links() on that selection.
    log.info("{}", page.getHtml().css("div[id='cyldata'] a").links().all());
    // A single CSS selector plus attribute name: abs:href of every <a> under the container.
    log.info("{}", page.getHtml().css("div[id='cyldata'] a", "abs:href").all());
    

    jsoup方式获取

    for (Element element : Jsoup.parse(page.getRawText(), page.getRequest().getUrl()).select("#cyldata a")) {
        log.info("{}", element.attr("abs:href"));
        log.info("{}", element.absUrl("href"));
    }
    

    jsoup中stringutil工具类方式获取

    for (Element element : Jsoup.parse(page.getRawText(), page.getRequest().getUrl()).select("#cyldata a")) {
        log.info("{}", StringUtil.resolve(page.getRequest().getUrl(), element.attr("href")));
    }
    

    示例代码

    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
        <parent>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-parent</artifactId>
            <version>2.1.4.RELEASE</version>
            <relativePath/> <!-- lookup parent from repository -->
        </parent>
        <groupId>com.ady01</groupId>
        <artifactId>java-pachong</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <name>java-pachong</name>
        <description>java爬虫项目</description>
    
        <properties>
            <java.version>1.8</java.version>
        </properties>
    
        <dependencies>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter</artifactId>
            </dependency>
    
            <dependency>
                <groupId>org.projectlombok</groupId>
                <artifactId>lombok</artifactId>
                <optional>true</optional>
            </dependency>
            <dependency>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-starter-test</artifactId>
                <scope>test</scope>
            </dependency>
    
            <!-- webmagic start -->
            <dependency>
                <groupId>us.codecraft</groupId>
                <artifactId>webmagic-core</artifactId>
                <version>0.7.3</version>
                <!--
                    Each transitive artifact is excluded exactly once; repeating an
                    exclusion has no additional effect.
                    fastjson and commons-io are declared explicitly further down in this pom.
                    NOTE(review): log4j / slf4j-log4j12 are presumably excluded so they do not
                    clash with the logging setup of the Spring Boot starter - confirm.
                -->
                <exclusions>
                    <exclusion>
                        <artifactId>fastjson</artifactId>
                        <groupId>com.alibaba</groupId>
                    </exclusion>
                    <exclusion>
                        <artifactId>commons-io</artifactId>
                        <groupId>commons-io</groupId>
                    </exclusion>
                    <exclusion>
                        <artifactId>log4j</artifactId>
                        <groupId>log4j</groupId>
                    </exclusion>
                    <exclusion>
                        <artifactId>slf4j-log4j12</artifactId>
                        <groupId>org.slf4j</groupId>
                    </exclusion>
                </exclusions>
            </dependency>
            <dependency>
                <groupId>us.codecraft</groupId>
                <artifactId>webmagic-extension</artifactId>
                <version>0.7.3</version>
            </dependency>
            <dependency>
                <groupId>us.codecraft</groupId>
                <artifactId>webmagic-selenium</artifactId>
                <version>0.7.3</version>
            </dependency>
            <dependency>
                <groupId>net.minidev</groupId>
                <artifactId>json-smart</artifactId>
                <version>2.2.1</version>
            </dependency>
            <!-- webmagic end -->
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>1.2.49</version>
            </dependency>
            <dependency>
                <groupId>commons-lang</groupId>
                <artifactId>commons-lang</artifactId>
                <version>2.6</version>
            </dependency>
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>2.6</version>
            </dependency>
            <dependency>
                <groupId>commons-codec</groupId>
                <artifactId>commons-codec</artifactId>
                <version>1.11</version>
            </dependency>
            <dependency>
                <groupId>commons-collections</groupId>
                <artifactId>commons-collections</artifactId>
                <version>3.2.2</version>
            </dependency>
        </dependencies>
    
        <build>
            <plugins>
                <plugin>
                    <groupId>org.springframework.boot</groupId>
                    <artifactId>spring-boot-maven-plugin</artifactId>
                </plugin>
            </plugins>
        </build>
    
    </project>
    
    package com.ady01.demo3;
    
    import lombok.extern.slf4j.Slf4j;
    import org.jsoup.Jsoup;
    import org.jsoup.helper.StringUtil;
    import org.jsoup.nodes.Element;
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Request;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.processor.PageProcessor;
    
    /**
     * Demonstrates several ways to obtain the absolute URLs of links with WebMagic.
     *
     * <p>Every approach reads the {@code <a>} elements under {@code div#cyldata} of the fetched
     * page and logs what it finds: XPath, XPath combined with a CSS selector, CSS selectors only,
     * jsoup, and jsoup's {@code StringUtil} helper.
     *
     * <p><b>time</b>: 2019/4/22 10:42 <br>
     * <b>author</b>: WeChat official account "路人甲Java", focused on sharing Java technology
     * (crawlers, distributed transactions, asynchronous messaging, task scheduling,
     * database/table sharding, big data, and more).
     */
    @Slf4j
    public class AbsHrefPageProcessor implements PageProcessor {
        /** Site configuration returned by {@link #getSite()}; its sleep time is set to 1000. */
        private final Site site = Site.me().setSleepTime(1000);

        /**
         * Logs the links found under {@code div#cyldata}, once per extraction technique.
         *
         * @param page the downloaded page handed in by WebMagic
         */
        @Override
        public void process(Page page) {
            // Ways to obtain the absolute URL of a hyperlink.
            log.info("----------------------xpath方式获取------------------------");
            // XPath: via links() on the container, and via the abs:href attribute directly.
            log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").links().all());
            log.info("{}", page.getHtml().xpath("//div[@id='cyldata']//a//@abs:href").all());

            // XPath for the container, CSS selector for the abs:href attribute.
            log.info("----------------------xpath+css选择器方式获取------------------------");
            log.info("{}", page.getHtml().xpath("//div[@id='cyldata']").css("a", "abs:href").all());

            // CSS selectors only, in four equivalent spellings.
            log.info("----------------------css选择器方式获取------------------------");
            log.info("{}", page.getHtml().css("div[id='cyldata']").css("a", "abs:href").all());
            log.info("{}", page.getHtml().css("div[id='cyldata']").links().all());
            log.info("{}", page.getHtml().css("div[id='cyldata'] a").links().all());
            log.info("{}", page.getHtml().css("div[id='cyldata'] a", "abs:href").all());

            // jsoup: both the attribute form and absUrl().
            log.info("----------------------jsoup方式获取------------------------");
            // Parse the raw text once, using the request URL as the base URI; the selected
            // anchors are reused by the StringUtil section below instead of re-parsing the page.
            String baseUrl = page.getRequest().getUrl();
            Iterable<Element> anchors = Jsoup.parse(page.getRawText(), baseUrl).select("#cyldata a");
            for (Element anchor : anchors) {
                log.info("{}", anchor.attr("abs:href"));
                log.info("{}", anchor.absUrl("href"));
            }

            // jsoup StringUtil: resolve the raw href against the page URL by hand.
            log.info("----------------------jsoup中stringutil工具类方式获取------------------------");
            for (Element anchor : anchors) {
                log.info("{}", StringUtil.resolve(baseUrl, anchor.attr("href")));
            }
        }

        /**
         * Returns the site configuration used for this crawl.
         *
         * @return the {@link Site} instance created for this processor
         */
        @Override
        public Site getSite() {
            return site;
        }

        /**
         * Runs a spider with this processor against a single hexun.com page.
         *
         * @param args unused
         */
        public static void main(String[] args) {
            Request request = new Request("http://industry.hexun.com/c193_59.shtml");
            Spider.create(new AbsHrefPageProcessor()).addRequest(request).run();
        }
    }
    

    ​执行结果:

  • 相关阅读:
    Java运行时内存
    java --对象流与对象的序列化
    Java 文件操作
    爬虫
    eclipse项目放到github
    越来越玄的JAVA
    map和set的遍历
    集合总览
    unsafe类
    狡诈的java并发容器
  • 原文地址:https://www.cnblogs.com/itsoku123/p/10748950.html
Copyright © 2011-2022 走看看