zoukankan      html  css  js  c++  java
  • 一个爬取https和http通用的工具类(JDK自带的URL的用法)

      今天用 Java 爬取天猫时遇到了 SSL 报错,于是从网上找了一个同时适用于 HTTPS 和 HTTP 的爬取工具类。不过该工具类有时爬到的数据不全;相比之下,不得不说 Python 爬虫很厉害。注意:该工具类会信任所有证书并跳过主机名校验,仅适合测试和学习使用。

    package cn.qlq.craw.Jsoup;
    
    import java.io.File;
    import java.io.FileWriter;
    import java.io.Writer;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.security.SecureRandom;
    import java.security.cert.CertificateException;
    import java.security.cert.X509Certificate;
    import java.util.Map;
    
    import javax.net.ssl.HostnameVerifier;
    import javax.net.ssl.HttpsURLConnection;
    import javax.net.ssl.SSLContext;
    import javax.net.ssl.SSLSession;
    import javax.net.ssl.X509TrustManager;
    
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.helper.HttpConnection;
    import org.jsoup.nodes.Document;  
      
    public class HttpCommonUtil {  
          
        public static void trustEveryone() {   
            try {    
                HttpsURLConnection.setDefaultHostnameVerifier(new HostnameVerifier() {    
                    public boolean verify(String hostname, SSLSession session) {    
                        return true;    
                    }    
                });    
        
                SSLContext context = SSLContext.getInstance("TLS");    
                context.init(null, new X509TrustManager[] { new X509TrustManager() {    
                    public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {    
                    }    
        
                    public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {    
                    }    
        
                    public X509Certificate[] getAcceptedIssuers() {    
                        return new X509Certificate[0];    
                    }    
                } }, new SecureRandom());    
                HttpsURLConnection.setDefaultSSLSocketFactory(context.getSocketFactory());    
            } catch (Exception e) {    
                e.printStackTrace();    
            }    
        }    
        
        public static Object getHttpHeaders(URL url, int timeout) {    
            try {    
                trustEveryone();   
                Connection conn = HttpConnection.connect(url);    
                conn.timeout(timeout);    
                conn.header("Accept-Encoding", "gzip,deflate,sdch");    
                conn.header("Connection", "close");    
                conn.get();    
                //String result=conn.response().body();  
                Map<String, String> result = conn.response().headers();    
                result.put("title", conn.response().parse().title());    
                return result;   
            } catch (Exception e) {    
                e.printStackTrace();    
            }    
            return null;    
        }    
        
        public static Object getHttpBody(URL url, int timeout) {    
            try {    
                trustEveryone();   
                Connection conn = HttpConnection.connect(url);    
                conn.timeout(timeout);    
                conn.header("Accept-Encoding", "gzip,deflate,sdch");    
                conn.header("Connection", "close");    
                conn.get();    
                //String result=conn.response().body();  
    //            String result = conn.response().body();    
                String result = conn.response().body();    
                File file = new File("C:\Users\liqiang\Desktop\实习\python\javaCrawPicture\tianmao.html");
                if(!file.exists()){
                    file.createNewFile();
                }else{
                    file.delete();
                }
                file.createNewFile();
                Writer fileWriter = new FileWriter(file); 
                fileWriter.write(result);
                fileWriter.close();
                return result;   
            } catch (Exception e) {    
                e.printStackTrace();    
            }    
            return null;    
        }    
        
        
        
        public static void main(String[] args) {    
            try {    
                URL url = new URL("http", "www.tmall.com", -1, "");   
                System.out.println(getHttpBody(url, 100000));
            } catch (MalformedURLException e) {    
                e.printStackTrace();    
            }    
        }    
    }  
  • 相关阅读:
    Zabbix配置文件详解之服务端zabbix_server
    Ansible批量远程管理Windows主机(部署与配置)
    ansible简要说明
    zabbix自动发现与自动注册
    Linux获取UUID
    python爬虫练习之批量下载zabbix文档
    cmake编译c++程序
    spring中PropertyPlaceholderConfigurer的运用---使用${property-name}取值
    spring中<bean>中parent标签的使用
    用静态工厂的方法实例化bean
  • 原文地址:https://www.cnblogs.com/qlqwjy/p/8886976.html
Copyright © 2011-2022 走看看