zoukankan      html  css  js  c++  java
  • java实现爬取静态页面的新闻数据


    可能需要的 pom 依赖包(下文贴出的实现代码只直接用到了 jsoup,其余依赖按项目需要取舍):

    <!-- https://mvnrepository.com/artifact/commons-codec/commons-codec -->
    <dependency>
    <groupId>commons-codec</groupId>
    <artifactId>commons-codec</artifactId>
    <version>1.4</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
    <dependency>
    <groupId>commons-logging</groupId>
    <artifactId>commons-logging</artifactId>
    <version>1.1.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/commons-httpclient/commons-httpclient -->
    <dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.8.3</version>
    </dependency>




    主要贴出实现类相关代码:


    /**
     * Crawls one news listing page and inserts a record for every linked article.
     *
     * <p>For each anchor inside a {@code type_content_list type-item} element that has a
     * non-empty {@code title} and whose link contains {@code .html}, the article page is
     * fetched, parsed into a fresh {@code NewsInformation} and handed to
     * {@code newsMapperExt.inserturlNews}.
     *
     * <p>I/O failures are reported via {@code printStackTrace()} and do not abort the crawl.
     * When an article page cannot be fetched, a record carrying only the title and URL is
     * still inserted (same best-effort behaviour as before).
     *
     * @param urls URL of the listing page to crawl; the field layout of the article page is
     *             chosen by matching this URL against the known section prefixes
     * @return always {@code true}, including when the listing page could not be fetched
     */
    @Override
    public boolean inserturlNews(String urls) {
        // TODO: 2021/5/17 only xinmin.cn pages are supported; parsing could be driven by page tags and scheduled
        try {
            Document doc = Jsoup.connect(urls).get();
            Elements listDiv = doc.getElementsByAttributeValue("class", "type_content_list type-item");
            for (Element element : listDiv) {
                for (Element link : element.getElementsByTag("a")) {
                    // absUrl resolves relative / protocol-relative hrefs against the listing page
                    // and yields "" when no absolute URL can be built, which the filter below skips.
                    String newsUrl = link.absUrl("href");
                    String title = link.attr("title");
                    if (title.isEmpty() || !newsUrl.contains(".html")) {
                        continue;
                    }

                    // One object per article: reusing a single instance let fields of a
                    // previously parsed article leak into the next inserted record.
                    NewsInformation newsInformation = new NewsInformation();
                    newsInformation.setTitle(title);
                    newsInformation.setNewsUrl(newsUrl);
                    try {
                        fillArticleDetails(newsInformation, newsUrl, urls);
                    } catch (IOException e) {
                        // Article page unreachable: keep going with the remaining links.
                        e.printStackTrace();
                    }
                    newsMapperExt.inserturlNews(newsInformation);
                }
            }
        } catch (IOException e) {
            // Listing page unreachable: nothing to crawl.
            e.printStackTrace();
        }

        return true;
    }

    /**
     * Fetches one article page and copies source, author, time, image, content and status
     * into {@code item}. Fields whose markup is missing are left unset instead of throwing.
     *
     * @param item    record to populate
     * @param newsUrl absolute URL of the article page
     * @param listUrl URL of the listing page the article was found on (selects the span layout)
     * @throws IOException if the article page cannot be fetched
     */
    private void fillArticleDetails(NewsInformation item, String newsUrl, String listUrl) throws IOException {
        Document newsDoc = Jsoup.connect(newsUrl).get();
        Elements infoSpans = newsDoc.select(".info").select("span");
        item.setForm(spanTextOrNull(infoSpans, 0));

        // NOTE(review): the "2021-" check is a hard-coded year carried over unchanged; from 2022
        // on every article falls into the fallback branch. Confirm the intended rule before
        // generalising it.
        boolean worldPoliticsOrHeadline = listUrl.contains("http://newsxmwb.xinmin.cn/world/")
                || listUrl.contains("http://newsxmwb.xinmin.cn/shizheng/")
                || listUrl.contains("http://shanghai.xinmin.cn/t/gdbd/");
        if (worldPoliticsOrHeadline) {
            // World, politics and headline sections: time is in span 3, else use the current time.
            item.setAuthor(spanTextOrNull(infoSpans, 1));
            String dataTime = spanTextOrNull(infoSpans, 3);
            if (dataTime == null || !dataTime.contains("2021-")) {
                // NOTE(review): `now` and `fmTime` are declared outside this method — confirm `now` is refreshed per call.
                dataTime = now.format(fmTime);
            }
            item.setDataTime(dataTime);
        } else if (listUrl.contains("http://newsxmwb.xinmin.cn/wentihui/")) {
            // Culture & sports section: time is in span 4, else fall back to span 3.
            item.setAuthor(spanTextOrNull(infoSpans, 1));
            String dataTime = spanTextOrNull(infoSpans, 4);
            if (dataTime == null || !dataTime.contains("2021-")) {
                dataTime = spanTextOrNull(infoSpans, 3);
            }
            item.setDataTime(dataTime);
        }

        Elements listNewsDetail = newsDoc.getElementsByAttributeValue("class", "a_content");
        for (Element listNews : listNewsDetail) {
            Elements contents = listNews.getElementsByTag("p");
            Elements images = listNews.getElementsByTag("img");
            // Elements.attr returns the attribute of the first matching element that has it.
            item.setImage(images.attr("src"));
            StringBuilder buffer = new StringBuilder();
            for (Element newsContent : contents) {
                buffer.append(newsContent.text().trim());
            }
            item.setContent(buffer.toString().trim());
            item.setStatus(1);
        }
    }

    /**
     * Returns the text of the span at {@code index}, or {@code null} when the article has
     * fewer spans than that, so an unexpected layout does not abort the whole crawl.
     */
    private static String spanTextOrNull(Elements spans, int index) {
        return index < spans.size() ? spans.get(index).text() : null;
    }
    }


    解析得到的新闻数据会插入本地数据库,用于展示。



  • 相关阅读:
    Revit 二次开发 几何
    Revit 二次开发 元素过滤练习
    Revit 二次开发 图元过滤
    Revit 二次开发 图元与参数
    扩展Revit的方式
    Revit API二次开发入门,完整学习流程,附源码
    Revit 二次开发学习视频
    在lldb调试中调用c++函数
    在lldb调试中调用c++函数
    lldb调试使用python脚本问题总结
  • 原文地址:https://www.cnblogs.com/yangsanluo/p/14845374.html
Copyright © 2011-2022 走看看