zoukankan      html  css  js  c++  java
  • webmagic 爬取网页所有文章的标题、时间、作者和内容

    package com.ij34;
    
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Spider;
    import us.codecraft.webmagic.pipeline.FilePipeline;
    import us.codecraft.webmagic.processor.PageProcessor;
    
    import java.util.List;
    
    public class HuxiuTest implements PageProcessor {
        @Override
        public void process(Page page) {
            List<String> requests = page.getHtml().links().regex(".*article.*").all();
             page.addTargetRequests(requests);
             page.putField("标题",page.getHtml().xpath("//div[@class='wrap-left pull-left']//h1/text()"));
             page.putField("作者", page.getHtml().xpath("//div[@class='article-author']//a/text()"));
             page.putField("时间", page.getHtml().xpath("//div[@class='column-link-box']/span[1]//text()"));
             page.putField("内容",page.getHtml().xpath("//div[@class='article-content-wrap']"));
    
        }
        @Override
        public Site getSite() {
            return Site.me().setDomain("www.huxiu.com");
        }
    
        public static void main(String[] args) {
            Spider.create(new HuxiuTest()).addUrl("https://www.huxiu.com/")
           .addPipeline(new FilePipeline("D:\webmagic\data\"))
            .run();
        }
    
    }
  • 相关阅读:
    启动时间知多少?8款音视频类应用测评报告分析
    优化信息流很麻烦?三招教你轻松搞定
    vmstat
    iostat
    dstat
    strace
    Mysql(一)
    rsync
    Kubernetes-深入分析集群安全机制
    Kubernetes-apiserver
  • 原文地址:https://www.cnblogs.com/tk55/p/8646016.html
Copyright © 2011-2022 走看看