zoukankan      html  css  js  c++  java
  • Java使用Jsoup获得新闻联播所有文字稿

    Jsoup的maven坐标:

    		<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    		<dependency>
    		    <groupId>org.jsoup</groupId>
    		    <artifactId>jsoup</artifactId>
    		    <version>1.11.3</version>
    		</dependency>
    

    Java代码:

    package com.zifeiy.test;
    
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStreamWriter;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    public class XinwenGetter {
    	
    	private static List<String> urlList = new ArrayList<String>();
    	
    	private static OutputStreamWriter out;
    	
    	private static void getUrlList() throws IOException {
    		for (int i = 1; i <= 44; i ++) {
    			String url = null;
    			if (i == 0) {
    				url = "http://www.xwlbo.com/txt.html";
    			} else {
    				url = "http://www.xwlbo.com/txt_" + i + ".html";
    			}
    			Document doc = Jsoup.connect(url).get();
    			Elements xwlistElements = doc.getElementsByClass("xwlist");
    			Elements aElements = xwlistElements.get(0).select("a");
    			for (Element element : aElements) {
    				String resUrl = element.attr("href");
    				urlList.add(resUrl);
    			}
    		}
    	}
    	
    	private static void solve(String url) throws IOException {
    		Document doc = Jsoup.connect(url).get();
    		System.out.println("handling " + doc.title() + " ...");
    		out.write("<h3>" + doc.title() + "</h3>
    ");
    		Elements textElements = doc.getElementsByClass("text_content");
    		Elements pElements = textElements.get(0).select("p");
    		for (Element pElement : pElements) {
    //			System.out.println(pElement);
    			out.write(pElement.toString() + "
    ");
    		}
    		out.write("<hr>
    ");
    	}
    	
    	
    	public static void main(String[] args) throws IOException {
    		
    		getUrlList();
    		
    		File file = new File("D:/新闻联播大全.html");
    		if (file.exists() == true) file.delete();
    		out = new OutputStreamWriter(new FileOutputStream(file, true), "UTF-8");
            
    		for (String url: urlList) {
    			solve(url);
    		}
    		
            out.close();
    		
    	}
    	
    }
    
    
  • 相关阅读:
    vue表单:输入身份证号码则自动获取对应的年龄和性别,,若不输入身份证号则自己填写年龄和性别
    el-input 电话号码输入时加上空格(344)
    vue图片上传---融合裁剪功能
    shell 基本编程
    virtualbox 安装centos ,运行shell 脚本
    js 检测变量类型
    js deepCopy
    python 安装requests库
    python 识别文件 文件夹
    python 删除非空文件夹
  • 原文地址:https://www.cnblogs.com/zifeiy/p/10527621.html
Copyright © 2011-2022 走看看