zoukankan      html  css  js  c++  java
  • 一个爬取https和http通用的工具类(JDK自带的URL的用法)

      今天在用 Java 爬取天猫页面时遇到了 SSL 报错,所以从网上找了一个 https 和 http 通用的爬取工具类。不过该工具类有时爬到的数据并不完整,在这一点上不得不说 Python 爬虫确实很厉害。

    package cn.qlq.craw.Jsoup;
    
    import java.io.File;
    import java.io.FileWriter;
    import java.io.Writer;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.security.SecureRandom;
    import java.security.cert.CertificateException;
    import java.security.cert.X509Certificate;
    import java.util.Map;
    
    import javax.net.ssl.HostnameVerifier;
    import javax.net.ssl.HttpsURLConnection;
    import javax.net.ssl.SSLContext;
    import javax.net.ssl.SSLSession;
    import javax.net.ssl.X509TrustManager;
    
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.helper.HttpConnection;
    import org.jsoup.nodes.Document;  
      
    public class HttpCommonUtil {  
          
        public static void trustEveryone() {   
            try {    
                HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {    
                    public boolean verify(String hostname, SSLSession session) {    
                        return true;    
                    }    
                });    
        
                SSLContext context = SSLContext.getInstance("TLS");    
                context.init(null, new X509TrustManager[] { new X509TrustManager() {    
                    public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {    
                    }    
        
                    public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {    
                    }    
        
                    public X509Certificate[] getAcceptedIssuers() {    
                        return new X509Certificate[0];    
                    }    
                } }, new SecureRandom());    
                HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());    
            } catch (Exception e) {    
                e.printStackTrace();    
            }    
        }    
        
        public static Object getHttpHeaders(URL url, int timeout) {    
            try {    
                trustEveryone();   
                Connection conn = HttpConnection.connect(url);    
                conn.timeout(timeout);    
                conn.header("Accept-Encoding", "gzip,deflate,sdch");    
                conn.header("Connection", "close");    
                conn.get();    
                //String result=conn.response().body();  
                Map<String, String> result = conn.response().headers();    
                result.put("title", conn.response().parse().title());    
                return result;   
            } catch (Exception e) {    
                e.printStackTrace();    
            }    
            return null;    
        }    
        
        public static Object getHttpBody(URL url, int timeout) {    
            try {    
                trustEveryone();   
                Connection conn = HttpConnection.connect(url);    
                conn.timeout(timeout);    
                conn.header("Accept-Encoding", "gzip,deflate,sdch");    
                conn.header("Connection", "close");    
                conn.get();    
                //String result=conn.response().body();  
    //            String result = conn.response().body();    
                String result = conn.response().body();    
                File file = new File("C:\Users\liqiang\Desktop\实习\python\javaCrawPicture\tianmao.html");
                if(!file.exists()){
                    file.createNewFile();
                }else{
                    file.delete();
                }
                file.createNewFile();
                Writer fileWriter = new FileWriter(file); 
                fileWriter.write(result);
                fileWriter.close();
                return result;   
            } catch (Exception e) {    
                e.printStackTrace();    
            }    
            return null;    
        }    
        
        
        
        public static void main(String[] args) {    
            try {    
                URL url = new URL("http", "www.tmall.com", -1, "");   
                System.out.println(getHttpBody(url, 100000));
            } catch (MalformedURLException e) {    
                e.printStackTrace();    
            }    
        }    
    }  
  • 相关阅读:
    【算法专题】多项式运算与生成函数
    【CodeForces】914 E. Palindromes in a Tree 点分治
    【BZOJ】1468: Tree(POJ1741) 点分治
    【BZOJ】2599: [IOI2011]Race 点分治
    【POJ】2142 The Balance 数论(扩展欧几里得算法)
    【算法专题】仙人掌图问题
    【BZOJ】2125: 最短路 圆方树(静态仙人掌)
    【BZOJ】4316: 小C的独立集 静态仙人掌
    【BZOJ】3039: 玉蟾宫 悬线法
    【BZOJ】1023: [SHOI2008]cactus仙人掌图 静态仙人掌(DFS树)
  • 原文地址:https://www.cnblogs.com/qlqwjy/p/8886976.html
Copyright © 2011-2022 走看看