zoukankan      html  css  js  c++  java
  • Java WebClient 总结

    /**
     * Creates a {@link WebClient} emulating Firefox 24 for fetching static HTML.
     *
     * <p>CSS and JavaScript processing are switched off, failing HTTP status codes and
     * script errors do not raise exceptions, and the timeout option is set to 20000
     * (milliseconds, per HtmlUnit's WebClientOptions). A fixed set of browser-like
     * request headers is registered on the client.
     *
     * @return the newly configured client
     */
    private WebClient getAWebClient() {
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);
        webClient.getOptions().setTimeout(20000);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setJavaScriptEnabled(false);
        // The first media type was previously the malformed token "textml"; a browser sends "text/html".
        webClient.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        webClient.addRequestHeader("Accept-Encoding", "gzip, deflate");
        webClient.addRequestHeader("Accept-Language", "en-US,en;q=0.5");
        webClient.addRequestHeader("Cache-Control", "max-age=0");
        webClient.addRequestHeader("Connection", "keep-alive");
        // NOTE(review): the Host header is pinned to www.amazon.com for every request made by this
        // client; confirm that no request (e.g. a redirect) is expected to target another host.
        webClient.addRequestHeader("Host", "www.amazon.com");
        webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0");
        return webClient;
    }
    /**
         * Crawls a single page and returns its XML serialization.
         *
         * <p>The client's cookies are cleared first; if the cookie file named by
         * {@code cookiePathAppendRandom()} exists, the serialized {@code CookieStore} in it is
         * loaded into the client. A cookie file that cannot be opened or deserialized is logged
         * and skipped, and the crawl continues without stored cookies.
         *
         * @param url the address to crawl
         * @return the page XML, or a builder containing the sentinel {@code "getNullPage"} when no
         *         page could be fetched or the result is 300 characters or shorter (in both cases
         *         the URL is also handed to {@code exceptionFun})
         */
        public StringBuilder crawlPage(String url) {
            StringBuilder builder = new StringBuilder();
            logger.info(Thread.currentThread().getName() + " crawl " + url);
            // The MyGetPage preparation code goes here.
            webClient.getCookieManager().clearCookies();
            logger.info(Thread.currentThread().getName() + " webClient.getCookieManager().clearCookies();");
            File file = new File(cookiePathAppendRandom());
            logger.info(Thread.currentThread().getName() + " File file = new File(cookiePathAppendRandom());");
            if (file.exists()) {
                CookieStore cookieStore = null;
                FileInputStream fin = null;
                ObjectInputStream in = null;
                try {
                    fin = new FileInputStream(file);
                    in = new ObjectInputStream(fin);
                    cookieStore = (CookieStore) in.readObject();
                } catch (IOException e) {
                    // Also covers FileNotFoundException (file removed between exists() and open).
                    logger.error(e);
                } catch (ClassNotFoundException e) {
                    logger.error(e);
                } finally {
                    // Closing the object stream closes the file stream too; fall back to the file
                    // stream when the object stream could not be constructed.
                    try {
                        if (in != null) {
                            in.close();
                        } else if (fin != null) {
                            fin.close();
                        }
                    } catch (IOException e) {
                        logger.error(e);
                    }
                }
                // cookieStore stays null when loading failed; skip instead of dereferencing it.
                if (cookieStore != null) {
                    for (org.apache.http.cookie.Cookie temp : cookieStore.getCookies()) {
                        Cookie cookie = new Cookie(temp.getDomain(), temp.getName(), temp.getValue(), temp.getPath(),
                                temp.getExpiryDate(), false);
                        webClient.getCookieManager().addCookie(cookie);
                    }
                }
            }
            logger.info(Thread.currentThread().getName() + " MyGetPage start,url:" + url);
            HtmlPage page = MyGetPage(new StringBuffer(url));
            logger.info(Thread.currentThread().getName() + " MyGetPage end,url:" + url);
            if (page == null) {
                // Models that failed during crawling are passed to exceptionFun; they could be
                // collected in one list and sent back to the server to be re-queued for crawling.
                logger.info("Page null!");
                AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
                exceptionFun(model);
                return (new StringBuilder("getNullPage"));
            }
            logger.info(Thread.currentThread().getName() + " builder.append(page.asXml());");
            builder.append(page.asXml());
            logger.info(Thread.currentThread().getName() + " return builder;");
            // builder.length() avoids copying the whole page into a String just to measure it.
            logger.info(Thread.currentThread().getName() + " CrawlPage $Length=" + builder.length());
            if (builder.length() <= 300) {
                // A result this short is treated the same way as a missing page.
                AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
                exceptionFun(model);
                return (new StringBuilder("getNullPage"));
            }
            return builder;
        }
    /**
         * Custom getPage: fetches the URL and, while the returned page is titled "Robot Check",
         * solves the captcha and submits the form until a different page comes back.
         *
         * <p>Failure handling:
         * <ul>
         * <li>A malformed URL is not retried, since retrying cannot succeed.</li>
         * <li>{@code UnknownHostException} sleeps for the configured number of minutes
         *     ({@code unknowhost_sleeptime}) and gives up after {@code unknowhost_maxtrytime}.</li>
         * <li>Any other exception (including failing HTTP status codes) is retried after one
         *     second and gives up once the attempt counter exceeds 5.</li>
         * <li>If the thread is interrupted while sleeping, the interrupt flag is restored and the
         *     method returns immediately.</li>
         * </ul>
         *
         * @param URL the address to fetch
         * @return the fetched page, or {@code null} when fetching was abandoned
         */
        private HtmlPage MyGetPage(StringBuffer URL) {
            final String threadName = Thread.currentThread().getName();
            final int maxClickAttempts = 5;
            final int maxRefreshAttempts = 5;
            HtmlPage page = null;
            boolean flag = true; // true while another fetch attempt is required
            int TryTimeCnt = 1;
            int UnknowHostTryTimeCnt = 1;
            while (flag) {
                flag = false;
                try {
                    logger.info(threadName + " webClient.getPage : " + URL + ",CrawlURL_id:"
                            + crawlURLId);
                    page = webClient.getPage(URL.toString());
                    Document doc = Jsoup.parse(page.asXml());
                    int robotchecknum = 1;
                    // NOTE(review): this loop is intentionally unbounded (retry until the captcha
                    // is accepted); consider a cap if the solver can fail persistently.
                    while (doc.select("title").text().equals("Robot Check")) {
                        logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                                + " [Robot Check,URL:" + URL + "]");
                        String captcha_str = AmazonGetCaptcha.GetCaptcha(new StringBuilder(doc.toString()));
                        logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                                + " end AmazonGetCaptcha.GetCaptcha");
                        logger.info(dayformat1.format(new Date()) + " " + threadName + " : "
                                + captcha_str);

                        logger.info(threadName + " page.getForms().get(0) Start");
                        HtmlForm form = page.getForms().get(0);
                        logger.info(threadName + " page.getForms().get(0) End");

                        logger.info(threadName + " form.getElementsByTagName(button).get(0) Start");
                        HtmlButton button = (HtmlButton) form.getElementsByTagName("button").get(0);
                        logger.info(threadName + " form.getElementsByTagName(button).get(0) End");

                        logger.info(threadName + " setValueAttribute Start");
                        form.getInputByName("field-keywords").setValueAttribute(captcha_str);
                        logger.info(threadName + " setValueAttribute End");

                        logger.info(threadName + " button.click Start");
                        int clickAttempts = 0;
                        boolean click_flag = false;
                        while (!click_flag) {
                            try {
                                page = button.click();
                                click_flag = true;
                            } catch (Exception e1) {
                                logger.error(threadName + " button.click出错了: " + e1);
                                clickAttempts++;
                                if (clickAttempts >= maxClickAttempts) {
                                    // Stop spinning on a click that keeps failing; the outer
                                    // handler counts this as one failed fetch attempt.
                                    throw e1;
                                }
                            }
                        }
                        logger.info(threadName + " button.click end");
                        int refreshAttempts = 0;
                        while (page.asXml() == null) {
                            logger.info(threadName + " page xml null");
                            logger.info(threadName + " " + page.asXml());
                            refreshAttempts++;
                            if (refreshAttempts > maxRefreshAttempts) {
                                throw new IllegalStateException("page XML still null after "
                                        + maxRefreshAttempts + " refreshes: " + URL);
                            }
                            // refresh() returns the reloaded page; keep it, otherwise the loop
                            // would re-test the same stale object forever.
                            page = (HtmlPage) page.refresh();
                            logger.info(threadName + " refresh End!");
                        }
                        logger.info(threadName + " button.click End");

                        logger.info(threadName + " Start ParsePage!");
                        doc = Jsoup.parse(page.asXml());
                        if (!doc.select("title").text().equals("Robot Check")) {
                            logger.info(threadName + " " + doc.select("title").text());
                            logger.info(threadName + " "
                                    + dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:"
                                    + captcha_str + ",try num:" + robotchecknum + "]");
                        }
                        robotchecknum++;
                    }

                } catch (MalformedURLException e) {
                    // The URL itself is invalid, so another attempt would fail identically.
                    logger.error(threadName + " " + e);
                    return null;
                } catch (UnknownHostException e) {
                    logger.error(threadName + " " + e);
                    flag = true;
                    int sleepMinutes = Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime"));
                    logger.info("found UnknownHostException,start sleep " + sleepMinutes + " min");
                    try {
                        // long arithmetic so a large configured value cannot overflow int
                        Thread.sleep(60L * 1000L * sleepMinutes);
                    } catch (InterruptedException e1) {
                        logger.error(threadName + " " + e1);
                        Thread.currentThread().interrupt();
                        return null;
                    }
                    logger.info("found UnknownHostException,end sleep " + sleepMinutes + " min");
                    UnknowHostTryTimeCnt++; // one more failed host lookup
                    logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                            + " [UnknowHostTryTimeCnt:" + UnknowHostTryTimeCnt + "]");
                    if (UnknowHostTryTimeCnt > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {
                        return null;
                    }
                } catch (Exception eq) {
                    // Also handles FailingHttpStatusCodeException, which previously was retried
                    // without any limit.
                    logger.error(threadName + " " + eq);
                    TryTimeCnt++; // one more failed attempt
                    logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                            + " [TryTimeCnt:" + TryTimeCnt + "]");
                    if (TryTimeCnt > 5) {
                        return null;
                    }
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        logger.error(threadName + e);
                        Thread.currentThread().interrupt();
                        return null;
                    }
                    flag = true;
                }
                try {
                    // Random 1.5-2.0 s pause after every attempt.
                    Thread.sleep(random.nextInt(500) + 1500);
                } catch (InterruptedException e) {
                    logger.error(threadName + e);
                    Thread.currentThread().interrupt();
                    // Interrupted: do not re-fetch. Hand back the page only if this attempt succeeded.
                    return flag ? null : page;
                }
            }
            return page;
        }
  • 相关阅读:
    MVC5+EF6简单实例以原有SQLServer数据库两表联合查询为例
    ArcGIS生成根据点图层生成等值面并减小栅格锯齿的操作步骤
    EF6+MVC5之Oracleo数据库的Code First方式实现
    jquery写的tab切换效果 非常简单
    APP消息推送是否进入消息中心和click、receive事件分析
    Android APP切换到后台接收不到推送消息
    js写的简单轮播图
    凉凉的笔记 two day
    凉凉的笔记 one.day
    在SQL Server数据库中批量导入数据的四种方法
  • 原文地址:https://www.cnblogs.com/zeze/p/6043300.html
Copyright © 2011-2022 走看看