zoukankan      html  css  js  c++  java
  • HtmlUnit的工具类(请求头,JavaScript,AJAX,验证代理服务器)

    一. HtmlUnitUtils

    package org.spider.htmlunit;
    
    import com.gargoylesoftware.htmlunit.BrowserVersion;
    import com.gargoylesoftware.htmlunit.DefaultCredentialsProvider;
    import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
    import com.gargoylesoftware.htmlunit.WebClient;
    import com.gargoylesoftware.htmlunit.html.HtmlPage;
    import com.gargoylesoftware.htmlunit.util.Cookie;
    import org.jsoup.helper.StringUtil;
    
    import java.io.IOException;
    import java.net.MalformedURLException;
    import java.net.URL;
    import java.util.HashMap;
    import java.util.Map;
    
    /**
     * @description:
     * @author: long.li
     * @date: 2019/2/20 14:40
     */
    public class HtmlUnitUtils {
        public static void main(String[] args) throws Exception {
            HtmlUnitBuilder builder = HtmlUnitBuilder.config()
                    .url("www.baidu.com")
                    .enableJS(true)
                    .enableCookie(true);
            getPage(builder).asText();
        }
    
    
        public static HtmlPage getPage(HtmlUnitBuilder builder)throws Exception{
            WebClient webClient = getWebClient(builder);
            int count = -1;
            while(true){
                try {
                    count++;
                    return webClient.getPage(builder.url());
                }
                catch (Exception e){
                    if(e instanceof IOException && count < builder.retry()){
                        //日志打印:e,重试次数:i,再次执行
                    }else {//如果url错误等情况
                        //执行失败,抛出异常
                        throw e;
                    }
                }
            }
        }
    
        public static WebClient getWebClient(HtmlUnitBuilder builder){
            WebClient webClient = null;
            if(isBlank(builder.proxyHost())){
                webClient = new WebClient(BrowserVersion.CHROME);
            }else{
                webClient = new WebClient(BrowserVersion.CHROME,builder.proxyHost(),builder.proxyPort());
                if(!isBlank(builder.username())){ //需要验证的代理服务器
                    ((DefaultCredentialsProvider) webClient.getCredentialsProvider()).
                            addCredentials(builder.username(),builder.password());
                }
            }
            //浏览器基本设置
            webClient.getOptions().setThrowExceptionOnScriptError(false);//当JS执行出错的时候是否抛出异常
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);//当HTTP的状态非200时是否抛出异常
            webClient.getOptions().setCssEnabled(builder.enableCSS());//是否启用CSS
            webClient.getOptions().setJavaScriptEnabled(builder.enableJS()); //默认设置为禁用
            if(builder.enableAjax()) {
                webClient.setAjaxController(new NicelyResynchronizingAjaxController());//设置支持AJAX
            }
    
            if(builder.enableCookie()){
                webClient.getCookieManager().setCookiesEnabled(true);
                for(Map.Entry<String,String> pair:builder.cookies().entrySet()){
                    webClient.getCookieManager().addCookie(new Cookie("/",pair.getKey(),pair.getKey()));
                }
    
            }
            webClient.waitForBackgroundJavaScript(builder.waitForBackgroundJavaScript());
            Map<String,String> headers = builder.headers();
            if(headers!=null&&headers.size()>0){
                for(Map.Entry<String,String> header:headers.entrySet()){
                    webClient.addRequestHeader(header.getKey(),header.getValue());
                }
            }
            return webClient;
        }
    
        /**
         * org.jsoup.helper.StringUtil
         * @param string
         * @return
         */
        private static boolean isBlank(String string) {
            if (string == null || string.length() == 0)
                return true;
    
            int l = string.length();
            for (int i = 0; i < l; i++) {
                if (!StringUtil.isWhitespace(string.codePointAt(i)))
                    return false;
            }
            return true;
        }
        private static boolean isWhitespace(int c){
            return c == ' ' || c == '	' || c == '
    ' || c == 'f' || c == '
    ';
        }
    }
    
    

    二. HtmlUnitBuilder

    package org.spider.htmlunit;
    
    
    import java.util.HashMap;
    import java.util.Map;
    
    /**
     * @description: Fluent, mutable holder for the settings of one HtmlUnit request: target URL,
     *               optional proxy (with optional credentials), CSS / JavaScript / AJAX / cookie
     *               switches, background-JavaScript wait time, retry count, request headers and
     *               cookies. Setters return {@code this}; the no-argument methods of the same
     *               name read the value back.
     * @author: long.li
     * @date: 2019/2/20 14:40
     */
    public class HtmlUnitBuilder {


        private String url; // request URL
        private String proxyHost; // proxy server host
        private int proxyPort; // proxy server port
        private String username; // proxy account
        private String password; // proxy password
        private boolean enableCSS = false; // CSS support
        private boolean enableJS = false; // JavaScript support
        private boolean enableAjax = false; // AJAX support
        private boolean enableCookie = false; // cookie support
        private int waitForBackgroundJavaScript = 0; // time to wait for background JavaScript
        private int retry = 0; // number of retries after a failed request
        private Map<String, String> headers = new HashMap<>(); // request headers, never null
        private Map<String, String> cookies = new HashMap<>(); // cookies (name -> value), never null


        /**
         * Starts a new configuration with all defaults.
         *
         * @return a fresh builder
         */
        public static HtmlUnitBuilder config() {
            return new HtmlUnitBuilder();
        }

        /**
         * Copies {@code source} into a new mutable map, treating null as empty, so the builder
         * never stores null and never aliases (or mutates) a map owned by the caller.
         */
        private static Map<String, String> copyOrEmpty(Map<String, String> source) {
            Map<String, String> copy = new HashMap<>();
            if (source != null) {
                copy.putAll(source);
            }
            return copy;
        }

        /**
         * Sets the request URL.
         *
         * @param url request URL
         * @return this builder
         */
        public HtmlUnitBuilder url(String url) {
            this.url = url;
            return this;
        }

        /**
         * Sets how many times a failed request is retried.
         *
         * @param retry retry count
         * @return this builder
         */
        public HtmlUnitBuilder retry(int retry) {
            this.retry = retry;
            return this;
        }

        /**
         * Configures a proxy server that needs no authentication.
         *
         * @param proxyHost proxy host
         * @param proxyPort proxy port
         * @return this builder
         */
        public HtmlUnitBuilder proxy(String proxyHost, int proxyPort) {
            this.proxyHost = proxyHost;
            this.proxyPort = proxyPort;
            return this;
        }

        /**
         * Configures a proxy server that requires authentication.
         *
         * @param proxyHost proxy host
         * @param proxyPort proxy port
         * @param username  proxy account
         * @param password  proxy password
         * @return this builder
         */
        public HtmlUnitBuilder proxy(String proxyHost, int proxyPort,
                                     String username, String password) {
            this.proxyHost = proxyHost;
            this.proxyPort = proxyPort;
            this.username = username;
            this.password = password;
            return this;
        }

        /**
         * Switches CSS support on or off.
         *
         * @param enableCSS true to enable CSS
         * @return this builder
         */
        public HtmlUnitBuilder enableCSS(boolean enableCSS) {
            this.enableCSS = enableCSS;
            return this;
        }

        /**
         * Switches JavaScript support on or off.
         *
         * @param enableJS true to enable JavaScript
         * @return this builder
         */
        public HtmlUnitBuilder enableJS(boolean enableJS) {
            this.enableJS = enableJS;
            return this;
        }

        /**
         * Switches AJAX support on or off.
         *
         * @param enableAjax true to enable AJAX
         * @return this builder
         */
        public HtmlUnitBuilder enableAjax(boolean enableAjax) {
            this.enableAjax = enableAjax;
            return this;
        }

        /**
         * Switches cookie support on or off.
         *
         * @param enableCookie true to enable cookies
         * @return this builder
         */
        public HtmlUnitBuilder enableCookie(boolean enableCookie) {
            this.enableCookie = enableCookie;
            return this;
        }

        /**
         * Replaces the cookie set with a copy of {@code cookies}.
         *
         * @param cookies cookie names mapped to values; null is treated as "no cookies"
         * @return this builder
         */
        public HtmlUnitBuilder cookies(Map<String, String> cookies) {
            this.cookies = copyOrEmpty(cookies);
            return this;
        }



        /**
         * Replaces the request headers with a copy of {@code headers}.
         *
         * @param headers header names mapped to values; null is treated as "no headers"
         * @return this builder
         */
        public HtmlUnitBuilder headers(Map<String, String> headers) {
            this.headers = copyOrEmpty(headers);
            return this;
        }

        /**
         * Adds (or overwrites) a single request header.
         *
         * @param key   header name
         * @param value header value
         * @return this builder
         */
        public HtmlUnitBuilder addHeader(String key, String value) {
            headers.put(key, value);
            return this;
        }

        /**
         * Sets how long to wait for background JavaScript.
         *
         * @param waitForBackgroundJavaScript wait time
         * @return this builder
         */
        public HtmlUnitBuilder waitForBackgroundJavaScript(int waitForBackgroundJavaScript) {
            this.waitForBackgroundJavaScript = waitForBackgroundJavaScript;
            return this;
        }

        /** @return the request URL, or null if none was set */
        public String url() {
            return url;
        }

        /** @return the retry count (0 by default) */
        public int retry() {
            return retry;
        }

        /** @return the proxy host, or null if no proxy was set */
        public String proxyHost() {
            return proxyHost;
        }

        /** @return the proxy port (0 by default) */
        public int proxyPort() {
            return proxyPort;
        }

        /** @return the proxy account, or null if none was set */
        public String username() {
            return username;
        }

        /** @return the proxy password, or null if none was set */
        public String password() {
            return password;
        }

        /** @return whether CSS support is enabled (false by default) */
        public boolean enableCSS() {
            return enableCSS;
        }

        /** @return whether JavaScript support is enabled (false by default) */
        public boolean enableJS() {
            return enableJS;
        }

        /** @return whether AJAX support is enabled (false by default) */
        public boolean enableAjax() {
            return enableAjax;
        }

        /** @return whether cookie support is enabled (false by default) */
        public boolean enableCookie() {
            return enableCookie;
        }

        /** @return the background-JavaScript wait time (0 by default) */
        public int waitForBackgroundJavaScript() {
            return waitForBackgroundJavaScript;
        }

        /** @return the builder's own mutable header map, never null */
        public Map<String, String> headers() {
            return headers;
        }

        /** @return the builder's own mutable cookie map, never null */
        public Map<String, String> cookies() {
            return cookies;
        }
    }
    
    
    
  • 相关阅读:
    C# 中的本地函数
    C# 9.0 正式发布了(C# 9.0 on the record)
    如何禁用控制台窗口的关闭按钮?
    在 WSL Ubuntu 上使用 .NET 进行跨平台开发新手入门
    C# 中 ConcurrentDictionary 一定线程安全吗?
    Docker 与 Podman 容器管理的比较
    C# 中的数字分隔符 _
    C# 8: 可变结构体中的只读实例成员
    C# 中的只读结构体(readonly struct)
    C# 8: 默认接口方法
  • 原文地址:https://www.cnblogs.com/lizijuna/p/11907390.html
Copyright © 2011-2022 走看看