zoukankan      html  css  js  c++  java
  • Java实现爬取静态页面的新闻数据


    可能需要的pom依赖包:

    <!-- https://mvnrepository.com/artifact/commons-codec/commons-codec -->
    <dependency>
    <groupId>commons-codec</groupId>
    <artifactId>commons-codec</artifactId>
    <version>1.4</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
    <dependency>
    <groupId>commons-logging</groupId>
    <artifactId>commons-logging</artifactId>
    <version>1.1.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/commons-httpclient/commons-httpclient -->
    <dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
    </dependency>




    主要贴出实现类相关代码:


    /**
     * Crawls one xinmin.cn news listing page and stores every article it links to.
     *
     * <p>The listing page is downloaded, every {@code <a>} inside an element whose
     * class is {@code "type_content_list type-item"} is inspected, and each link that
     * has a non-empty {@code title} and an URL containing {@code ".html"} is downloaded,
     * parsed and handed to {@code newsMapperExt.inserturlNews}.
     *
     * <p>Every article gets its own {@code NewsInformation} instance so that no field
     * can leak from one article into the next. An article whose page cannot be
     * downloaded is skipped instead of being stored with incomplete data.
     *
     * @param urls URL of the listing page to crawl
     * @return {@code true} when the listing page was downloaded and processed;
     *         {@code false} when {@code urls} is null/blank or the listing page
     *         could not be downloaded
     */
    @Override
    public boolean inserturlNews(String urls) {
        // TODO: 2021/5/17 only xinmin.cn is supported; parsing could be scheduled per page layout
        if (urls == null || urls.trim().isEmpty()) {
            return false;
        }
        final String url = urls;

        final Document doc;
        try {
            doc = Jsoup.connect(url).get();
        } catch (IOException e) {
            // Nothing can be crawled without the listing page: report the failure.
            e.printStackTrace();
            return false;
        }

        Elements listDiv = doc.getElementsByAttributeValue("class", "type_content_list type-item");
        for (Element element : listDiv) {
            for (Element link : element.getElementsByTag("a")) {
                // absUrl resolves relative links against the listing page so that
                // Jsoup.connect always receives an absolute URL.
                String newsUrl = link.absUrl("href");
                String title = link.attr("title");
                if (title.isEmpty() || !newsUrl.contains(".html")) {
                    continue;
                }

                NewsInformation newsInformation = new NewsInformation();
                newsInformation.setTitle(title);
                newsInformation.setNewsUrl(newsUrl);
                try {
                    Document newsDoc = Jsoup.connect(newsUrl).get();
                    fillNewsDetailFromPage(newsInformation, newsDoc, url);
                } catch (IOException e) {
                    // One broken article must not stop the crawl, and must not be stored half-filled.
                    e.printStackTrace();
                    continue;
                }
                newsMapperExt.inserturlNews(newsInformation);
            }
        }
        return true;
    }

    /**
     * Copies source, author, publication time, first image and body text from an
     * article page into {@code newsInformation}.
     *
     * @param newsInformation record to fill
     * @param newsDoc parsed article page
     * @param listUrl URL of the listing page; it selects which {@code .info span}
     *        positions hold the author and the publication time
     */
    private void fillNewsDetailFromPage(NewsInformation newsInformation, Document newsDoc, String listUrl) {
        Elements infoSpans = newsDoc.select(".info").select("span");
        newsInformation.setForm(infoSpanText(infoSpans, 0));

        if (listUrl.contains("http://newsxmwb.xinmin.cn/wentihui/")) {
            // Culture & sports section: the time is normally the 5th span, otherwise the 4th.
            newsInformation.setAuthor(infoSpanText(infoSpans, 1));
            String dataTime = infoSpanText(infoSpans, 4);
            if (!containsNewsDate(dataTime)) {
                dataTime = infoSpanText(infoSpans, 3);
            }
            newsInformation.setDataTime(dataTime);
        } else if (listUrl.contains("http://newsxmwb.xinmin.cn/world/")
                || listUrl.contains("http://newsxmwb.xinmin.cn/shizheng/")
                || listUrl.contains("http://shanghai.xinmin.cn/t/gdbd/")) {
            // World, politics and headline sections share one layout: time is the 4th span.
            newsInformation.setAuthor(infoSpanText(infoSpans, 1));
            String dataTime = infoSpanText(infoSpans, 3);
            if (!containsNewsDate(dataTime)) {
                // `now` and `fmTime` are declared outside this method; presumably the
                // crawl time and its formatter — confirm against the class fields.
                dataTime = now.format(fmTime);
            }
            newsInformation.setDataTime(dataTime);
        }

        Elements listNewsDetail = newsDoc.getElementsByAttributeValue("class", "a_content");
        for (Element listNews : listNewsDetail) {
            Elements contents = listNews.getElementsByTag("p");
            Elements images = listNews.getElementsByTag("img");
            // Elements.attr returns the attribute of the first matching element that has it.
            newsInformation.setImage(images.attr("src"));
            StringBuilder buffer = new StringBuilder();
            for (Element newsContent : contents) {
                buffer.append(newsContent.text().trim());
            }
            newsInformation.setContent(buffer.toString().trim());
            newsInformation.setStatus(1);
        }
    }

    /** Returns the text of the span at {@code index}, or an empty string when the page has fewer spans. */
    private String infoSpanText(Elements spans, int index) {
        return index < spans.size() ? spans.get(index).text() : "";
    }

    /** Tells whether {@code text} contains a {@code yyyy-M-d} style date of any year. */
    private boolean containsNewsDate(String text) {
        return text != null
                && java.util.regex.Pattern.compile("\\d{4}-\\d{1,2}-\\d{1,2}").matcher(text).find();
    }
    }


    爬取到的新闻数据会插入本地数据库,并用于页面展示。



  • 相关阅读:
    利用selenroid扩展uiautoamtor的webview解析能力
    设备offline时如何自动重置
    Docker集群管理portainer的使用
    Dockerfile编写的注意事项
    @RequestMapping注解学习
    美团面试总结
    排序算法总结
    java实现二分法查找
    设计模式之---单例模式
    http请求状态码解析
  • 原文地址:https://www.cnblogs.com/yangsanluo/p/14845374.html
Copyright © 2011-2022 走看看