zoukankan      html  css  js  c++  java
  • 使用Java Jsoup爬取网页内容(存入本地并从本地读取)

    GetPageInfo 获取数据、存入本地、从本地读取数据

    
    import lombok.SneakyThrows;
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    
    import java.io.*;
    
    public class GetPageInfo {
        public static void main(String[] args) throws Exception {
            // 获取网页数据并保存到本地
            String casData = "";        // 创建一个StringBuilder用于存储爬取到并处理过的数据,存入本地
            int count = 0;
            Integer pages = 10;   // 爬取的页数
            for(int i = 1;i  <= pages;i ++){
                String url = "https://www.xxx.com.cn/list/?p=" + (i + "");
                // url = url.replace("pageNum",i + "");
                String cas = getCas(url);       // 调用爬取方法传入url
                System.out.println("第" + i+"页数据获取成功");
                casData += cas + " ";        // 字符串追加 每次追加完成后加个空格
                // 每10页写入一次,提高效率
                if(i % 10 == 0){
                    if(count == 0){
                        //将文件写入本地
                        writeOcrStrtoFile(casData,"F:\nistCasData","cas.txt");
                        System.out.println("第" + i/10 +"次保存成功");
                        casData = "";
                        count ++;
                    }else{
                        String tempRead = readFileByLines("F:\nistCasData\cas.txt");
                        tempRead += casData;
                        writeOcrStrtoFile(tempRead,"F:\nistCasData","cas.txt");
                        System.out.println("第" + i/10 +"次保存成功");
                        casData = "";
                    }
                }
            }
    
            // 将剩下i % 10 != 0的数据写入
            // 读取本地文件
            String readData = readFileByLines("F:\nistCasData\cas.txt");
            readData += casData;
            writeOcrStrtoFile(readData,"F:\nistCasData","cas.txt");
    
    //        String[] arr = readData.split("\s+");  // 分割一个或者多个空格
    //        for(int i = 0;i < arr.length;i ++){
    //            System.out.println(i + ":" + arr[i]);
    //        }
        }
    
        /**
         * 获取网页数据
         * @param url
         */
        @SneakyThrows
        public static String getCas(String url){
            // 如果报错,忽略url的https证书;http开头的应该可以不用处理
            HttpsUrlValidator.retrieveResponseFromServer(url);
            // 加入url并编写请求头,打开浏览器控制台照着写
            Connection.Response response = Jsoup
                    .connect(url)
                    .header("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9")
                    .header("Accept-Encoding","*/*")
                    .header("Accept-Language","zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja-JP;q=0.6,ja;q=0.5,ko-KR;q=0.4,ko;q=0.3")
                    .header("Connection","keep-alive")
                    .header("Content-Type","application/json;charset=UTF-8")
                    .header("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36")
                    .timeout(10000)     // 设置超时时间
                    .ignoreContentType(true)
                    .execute();
    
            String html = response.body();      // 获取到的html字符串
            Document doc = Jsoup.parse(html); // 使用jsoup 进行语言转换
    //        System.out.println(doc.select(".alink").size());        // 查看class="alink"的个数
            String cas = doc.select(".alink").text();       // 获取class="alink"的数据
            return  cas;
        }
    
        /**
         * 保存文件到本地
         * @param result  需要写入的数据
         * @param outPath   保存的路径
         * @param outFileName   保存的文件名
         * @throws Exception
         */
        public static void writeOcrStrtoFile(String result, String outPath, String outFileName) throws Exception {
            File dir = new File(outPath);
            if(!dir.exists()) {
                dir.mkdirs();
            }
            File txt = new File(outPath + "/" + outFileName);
            // 先删除;否则会直接追加在之前的内容后面,成几何倍数增长
            if (txt.isFile() && txt.exists()) {
                txt.delete();
            }
            // 再创建
            if (!txt.exists()) {
                txt.createNewFile();
            }
            byte bytes[] = new byte[512];
            bytes = result.getBytes();
            int b = bytes.length; // 是字节的长度,不是字符串的长度
            FileOutputStream fos = new FileOutputStream(txt);
            fos.write(bytes);
            fos.flush();
            fos.close();
        }
    
        /**
         * 读取本地文件(按行读取),因为存的时候没换行,所以按行读取
         * @param fileName  文件名
         */
        public static String readFileByLines(String fileName) {
            File file = new File(fileName);
            String readData = "";
            BufferedReader reader = null;
            try {
                String tempString = null;
                reader = new BufferedReader(new FileReader(file));
                // 一次读一行,读入null时文件结束
                while ((tempString = reader.readLine()) != null) {
                    readData += tempString;
                }
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e1) {
                        e1.printStackTrace();
                    }
                }
            }
            return readData;
    
        }
    
    
    }
    
    

    忽略https证书(http应该不需要,没试过)

    
    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.InputStreamReader;
    import java.net.HttpURLConnection;
    import java.net.URL;
    
    import javax.net.ssl.HostnameVerifier;
    import javax.net.ssl.HttpsURLConnection;
    import javax.net.ssl.SSLSession;
    
    
    /**
     * Helper that switches off HTTPS certificate and host-name verification for
     * {@link HttpsURLConnection} and fetches a URL with those relaxed settings.
     *
     * <p><b>Security warning:</b> the trust manager and host-name verifier installed here
     * accept everything and are registered as JVM-wide defaults, so every later
     * {@code HttpsURLConnection} in the process is affected. Do not use this against
     * servers whose identity matters.
     */
    public class HttpsUrlValidator {

        /** Host-name verifier that logs the requested and peer host and accepts every host. */
        static HostnameVerifier hv = new HostnameVerifier() {
            @Override
            public boolean verify(String urlHostName, SSLSession session) {
                System.out.println("Warning: URL Host: " + urlHostName + " vs. "
                        + session.getPeerHost());
                return true;
            }
        };

        /**
         * Installs the trust-all defaults and downloads the body of {@code url}.
         *
         * @param url HTTP or HTTPS address to fetch
         * @return the response body with every line terminated by {@code '\n'}, or
         *         {@code null} if the URL is malformed or the request fails
         */
        public final static String retrieveResponseFromServer(final String url) {
            HttpURLConnection connection = null;

            try {
                URL validationUrl = new URL(url);
                trustAllHttpsCertificates();
                HttpsURLConnection.setDefaultHostnameVerifier(hv);

                connection = (HttpURLConnection) validationUrl.openConnection();
                final StringBuilder body = new StringBuilder(255);
                // The reader decodes with the platform default charset, as before;
                // try-with-resources makes sure the stream is closed on every path.
                try (BufferedReader in = new BufferedReader(new InputStreamReader(
                        connection.getInputStream()))) {
                    String line;
                    while ((line = in.readLine()) != null) {
                        body.append(line).append('\n');
                    }
                }
                return body.toString();
            } catch (final Exception e) {
                // Best-effort: callers get null instead of an exception.
                System.err.println("Request to " + url + " failed: " + e);
                return null;
            } finally {
                if (connection != null) {
                    connection.disconnect();
                }
            }
        }

        /**
         * Registers an SSL socket factory that trusts every certificate as the default for
         * all {@code HttpsURLConnection} instances in this JVM.
         *
         * @throws Exception if the SSL context cannot be created or initialised
         */
        public static void trustAllHttpsCertificates() throws Exception {
            javax.net.ssl.TrustManager[] trustAllCerts = { new miTM() };
            javax.net.ssl.SSLContext sc = javax.net.ssl.SSLContext
                    .getInstance("SSL");
            sc.init(null, trustAllCerts, null);
            javax.net.ssl.HttpsURLConnection.setDefaultSSLSocketFactory(sc
                    .getSocketFactory());
        }

        /** Trust manager that accepts every client and server certificate chain. */
        static class miTM implements javax.net.ssl.TrustManager,
                javax.net.ssl.X509TrustManager {
            /** Returns an empty array; the interface contract requires a non-null result. */
            @Override
            public java.security.cert.X509Certificate[] getAcceptedIssuers() {
                return new java.security.cert.X509Certificate[0];
            }

            /** Always reports the server chain as trusted. */
            public boolean isServerTrusted(
                    java.security.cert.X509Certificate[] certs) {
                return true;
            }

            /** Always reports the client chain as trusted. */
            public boolean isClientTrusted(
                    java.security.cert.X509Certificate[] certs) {
                return true;
            }

            /** Accepts any server certificate chain by never throwing. */
            @Override
            public void checkServerTrusted(
                    java.security.cert.X509Certificate[] certs, String authType)
                    throws java.security.cert.CertificateException {
                // intentionally empty: every chain is accepted
            }

            /** Accepts any client certificate chain by never throwing. */
            @Override
            public void checkClientTrusted(
                    java.security.cert.X509Certificate[] certs, String authType)
                    throws java.security.cert.CertificateException {
                // intentionally empty: every chain is accepted
            }
        }

    }
    
  • 相关阅读:
    ES6学习--函数剩余参数 (rest参数)
    ES6学习 --函数参数默认值与解构赋值默认值
    ES6学习--Array.from()方法
    02 Python基础知识(一)
    01 Python初探
    c#利用IronPython调用python的过程种种问题
    Android 对话框(Dialogs)
    不可不知的安卓屏幕知识
    C#中的Split用法以及详解
    关于XML文档操作类
  • 原文地址:https://www.cnblogs.com/zhangzimuzjq/p/13578607.html
Copyright © 2011-2022 走看看