zoukankan      html  css  js  c++  java
  • selenium处理极验滑动验证码

    要爬取的一个网站用了极验的滑动验证码,这周都在想怎么破解它。在网上搜了很多资料,在知乎上看到有人问过这个问题:https://www.zhihu.com/question/28833985 ,于是我按照其中的思路大概实现了一下。

    1.使用htmlunit(这种方式我没成功,模拟鼠标拖拽后轨迹没生成,可以跳过)

    我用的是java,首先想到的是直接用htmlunit,于是先做了一些初始化:

    /**
     * Lazily creates the shared HtmlUnit {@code webClient}; does nothing when one already exists.
     *
     * <p>The new client emulates Firefox 24, goes through the proxy at 127.0.0.1:8888, has
     * JavaScript and CSS enabled, does not throw on script errors or failing status codes, and
     * starts with a copy of every cookie currently returned by {@code client.getCookies()}.
     */
    private void initWebClient() {
        if (webClient == null) {
            webClient = new WebClient(BrowserVersion.FIREFOX_24);

            // Network: local proxy and insecure-SSL mode (original note: certificate configuration).
            webClient.getOptions().setProxyConfig(new ProxyConfig("127.0.0.1", 8888));
            webClient.getOptions().setUseInsecureSSL(true);

            // Page features the captcha widget relies on.
            webClient.getOptions().setActiveXNative(true);
            webClient.getOptions().setJavaScriptEnabled(true);
            webClient.getOptions().setCssEnabled(true);
            webClient.setCssErrorHandler(new SilentCssErrorHandler());

            // Keep going when the page's scripts fail or a request returns an error status.
            webClient.getOptions().setThrowExceptionOnScriptError(false);
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);

            // Seed the browser session with the cookies obtained through the other HTTP client.
            CookieManager jar = new CookieManager();
            for (org.apache.http.cookie.Cookie source : client.getCookies()) {
                jar.addCookie(new com.gargoylesoftware.htmlunit.util.Cookie(source));
            }
            webClient.setCookieManager(jar);
        }
    }

    初始化代理,cookie..然后就能正常调用了

    // Load the login page and pass it to the captcha-handling routine.
    HtmlPage page = webClient.getPage("http://www.qixin.com/login");// Qixinbao (qixin.com) login page
    gePageInfor(page);

    下面就是我获取图片、还原图片并且模拟拖拽的代码(我觉得这里是有些问题的,可能是我模拟的拖拽不对,导致触发的js并没有生成正确的轨迹,还请大家帮忙看看哪里错了):

    /**
     * Attempts to solve the slider captcha on {@code page} using HtmlUnit events only.
     *
     * <p>Steps, in order: resolve the two tile-image URLs from the element with id
     * {@code captcha}, download both images through {@code client}, rebuild them with
     * {@code ImgTest.getGeetestImg}, ask {@code ImgTest.deCAPTCHA} for the slide offset, then
     * fire mouseOver / mouseDown / a synthetic move / mouseOver / mouseUp on the slider knob and
     * print the resulting page XML. If downloading or decoding fails, the stack trace is printed
     * and the offset stays 0. The retry at the end is commented out.
     *
     * @param page the page containing the captcha element
     */
    private void gePageInfor(HtmlPage page) {
            // {tag, attribute, value} triples identifying the tiles of the full and the cut background.
            String[] img_slice={"div", "class", "gt_cut_fullbg_slice"};
            String[] img_bg_slice={"div", "class", "gt_cut_bg_slice"};
            HtmlDivision div = (HtmlDivision) page.getElementById("captcha");
            int deCAPTCHA = 0;
            try {
                byte[] img_slice_binary = client.get(getImgUrl(img_slice, div, true)).getBinary();// fetch the image bytes
                byte[] img_bg_slice_binary = client.get(getImgUrl(img_bg_slice, div, false)).getBinary();
                // rebuild the unscrambled images
                BufferedImage geetestImg = ImgTest.getGeetestImg(img_slice_binary, ImgTest.imgArray);
                BufferedImage geetestImg2 = ImgTest.getGeetestImg(img_bg_slice_binary, ImgTest.imgArray);
                // compute the slide offset (still unreliable; should be replaced by third-party image recognition)
                deCAPTCHA =ImgTest.deCAPTCHA(geetestImg,geetestImg2);
                System.out.println(deCAPTCHA);
            } catch (IOException | FetchException e) {
                // Best effort: the failure is only printed and the drag proceeds with deCAPTCHA == 0.
                e.printStackTrace();
            }
            HtmlDivision div_slider_knob = get_div_slider_knob(page,"gt_slider_knob gt_show");// the div to drag
            HtmlPage mouseOver = (HtmlPage) div_slider_knob.mouseOver();
            HtmlPage mouseDownPage = (HtmlPage)div_slider_knob.mouseDown();
            // Look the knob up again by the class it carries after mouseDown.
            div_slider_knob = get_div_slider_knob(mouseDownPage,"gt_slider_knob gt_show moving");
            mouseMoveX(deCAPTCHA, div_slider_knob, mouseDownPage);
            HtmlPage newPage =(HtmlPage)div_slider_knob.mouseOver();
    //        newPage =(HtmlPage)div_slider_knob.mouseDown();
            System.out.println(newPage.asXml());
            div = (HtmlDivision)newPage.getElementById("captcha");
            // Debug output only: the first div whose class is "gt_slice gt_show moving".
            HtmlElement htmlElement = div.getElementsByAttribute("div", "class", "gt_slice gt_show moving").get(0);
            System.out.println(htmlElement);
            newPage =(HtmlPage)div_slider_knob.mouseUp();// triggers the page JS, but no drag trajectory is generated
            System.out.println("---------------");
            System.out.println(newPage.asXml());        
            if (newPage.getElementById("captcha")!=null) {// retry on failure (currently disabled)
                //gePageInfor(newPage);
            }
        }
    
        /**
         * Fires a synthetic mouse-move event on the page's document element, with the event's
         * client X increased by the slide distance.
         *
         * @param deCAPTCHA       distance added to the event's client X; 0 falls back to 99
         * @param div_slider_knob node the event is constructed for
         * @param mouseDown       page whose document element receives the event
         */
        private void mouseMoveX(int deCAPTCHA, HtmlDivision div_slider_knob, HtmlPage mouseDown) {
            // A zero offset means no distance was computed; use the fixed fallback instead.
            int shift;
            if (deCAPTCHA == 0) {
                shift = 99;
            } else {
                shift = deCAPTCHA;
            }

            MouseEvent moveEvent = new MouseEvent(
                    div_slider_knob, MouseEvent.TYPE_MOUSE_MOVE, false, false, false, MouseEvent.BUTTON_LEFT);
            moveEvent.setClientX(moveEvent.getClientX() + shift);
            mouseDown.getDocumentElement().fireEvent(moveEvent);
        }
        /**
         * Returns the first {@code div} inside the element with id {@code captcha} that
         * {@code getElementsByAttribute} reports for the given {@code class} attribute value.
         *
         * @param page        page containing the captcha element
         * @param classString value of the {@code class} attribute to look for
         * @return the first matching element, cast to {@link HtmlDivision}
         * @throws IndexOutOfBoundsException if no element matches
         */
        private HtmlDivision get_div_slider_knob(HtmlPage page,String classString) {
            HtmlDivision captcha = (HtmlDivision) page.getElementById("captcha");
            List<HtmlElement> matches = captcha.getElementsByAttribute("div", "class", classString);
            return (HtmlDivision) matches.get(0);
        }
    
        private String getImgUrl(String[] img_slice, HtmlDivision div, boolean isNeedCheckPostion) {
            String url ="";
            int[] postion = new int[2];
            boolean empty = div.getElementsByAttribute(img_slice[0],img_slice[1],img_slice[2]).isEmpty();
            if (div.hasChildNodes() && !empty) {
                List<HtmlElement> elementsByAttribute = div.getElementsByAttribute(img_slice[0],img_slice[1],img_slice[2]);    
                for(int i = 0;i<elementsByAttribute.size();i++){
                    HtmlDivision div_img = (HtmlDivision)elementsByAttribute.get(i);
                    String style = div_img.getAttribute("style");
                    String[] imge_url_position = style.split(";");
                    if(StringUtils.isBlank(url)){//确认url
                        url = StringUtils.replacePattern(imge_url_position[0], ".*\(", "").replace(")", "");
                    }
                    if (isNeedCheckPostion) {//确认图片切割postion,两张图切割方式一样  background-position: -157px -58px
    //                    String[] positionS = StringUtils.split(StringUtils.remove(imge_url_position[1], "px").replace("-", "").replaceAll(".*:", ""), null);
                        String[] positionS = StringUtils.split(StringUtils.removePattern(imge_url_position[1], "[^\d+ \s]"),null);
                        postion[0] = Integer.parseInt(positionS[0]);
                        postion[1] = Integer.parseInt(positionS[1]);
                        int[] is = ImgTest.imgArray[i];
                        if (is[0]!=postion[0]||is[1]!=postion[1]) {
                            logger.debug("更新分割postion");
                            ImgTest.imgArray[i] = postion;
                        }
                        System.out.println(ImgTest.imgArray);
                        isNeedCheckPostion= false;
                    }
                }
            }
            return url;
        }

    对比图片获取位移的方法(deCAPTCHA)是错的,我就不放代码了。下面是其中还原图片用的方法,其实审查元素后你就明白怎么还原这个图片了,这里每次截取的是宽10px、高58px的一小块:

    /**
     * Rebuilds the captcha picture from its scrambled source image.
     *
     * <p>For every entry {@code {x, y}} of {@code imgArray} a 10x58 tile is cut out of the decoded
     * image at that position. The first half of the tiles (in table order) is joined left to
     * right into the top row, the remaining tiles into the bottom row, and the two rows are
     * stacked vertically.
     *
     * @param binary   encoded image bytes as downloaded
     * @param imgArray tile origins in the source image, in display order
     * @return the reassembled image; for a one-entry table, that single tile
     * @throws IOException              if the bytes cannot be read or decoded as an image
     * @throws IllegalArgumentException if {@code imgArray} is null or empty
     */
    public static BufferedImage getGeetestImg(byte[] binary, int[][] imgArray) throws IOException {
        if (imgArray == null || imgArray.length == 0) {
            throw new IllegalArgumentException("imgArray must contain at least one tile position");
        }
        BufferedImage source = ImageIO.read(new ByteArrayInputStream(binary));
        if (source == null) {
            // ImageIO.read returns null when no registered reader understands the data.
            throw new IOException("captcha image data could not be decoded");
        }

        List<BufferedImage> tiles = new ArrayList<>(imgArray.length);
        for (int[] origin : imgArray) {
            tiles.add(source.getSubimage(origin[0], origin[1], 10, 58));
        }

        int mid = tiles.size() >>> 1;
        BufferedImage top = joinRow(tiles.subList(0, mid));
        BufferedImage bottom = joinRow(tiles.subList(mid, tiles.size()));
        if (top == null) {
            return bottom; // only one tile in total: there is no top row to stack
        }
        return mergeImage(top, bottom, false);
    }

    /** Joins the given tiles left to right; returns null for an empty list and the tile itself for one tile. */
    private static BufferedImage joinRow(List<BufferedImage> rowTiles) throws IOException {
        BufferedImage row = null;
        for (BufferedImage tile : rowTiles) {
            row = (row == null) ? tile : mergeImage(row, tile, true);
        }
        return row;
    }
         /**
          * Combines two images into a new {@code TYPE_INT_RGB} image, side by side or stacked.
          *
          * <p>Horizontal: {@code img2} is placed to the right of {@code img1}; the result is
          * {@code w1 + w2} wide and as tall as the taller input. Vertical: {@code img2} is placed
          * below {@code img1}; the result is {@code h1 + h2} tall and as wide as the wider input.
          * Any area not covered by either input stays black.
          *
          * @param img1         left (horizontal) or top (vertical) image
          * @param img2         right (horizontal) or bottom (vertical) image
          * @param isHorizontal {@code true} to join side by side, {@code false} to stack
          * @return the combined image
          * @throws NullPointerException if either image is null
          * @throws IOException          never thrown here; kept so existing callers still compile
          */
         public static BufferedImage mergeImage(BufferedImage img1,
                 BufferedImage img2, boolean isHorizontal) throws IOException {
             java.util.Objects.requireNonNull(img1, "img1");
             java.util.Objects.requireNonNull(img2, "img2");

             int w1 = img1.getWidth();
             int h1 = img1.getHeight();
             int w2 = img2.getWidth();
             int h2 = img2.getHeight();

             // Read both images row by row into packed RGB arrays.
             int[] firstPixels = img1.getRGB(0, 0, w1, h1, null, 0, w1);
             int[] secondPixels = img2.getRGB(0, 0, w2, h2, null, 0, w2);

             // Size the canvas from BOTH inputs so the second image always fits.
             int destWidth = isHorizontal ? w1 + w2 : Math.max(w1, w2);
             int destHeight = isHorizontal ? Math.max(h1, h2) : h1 + h2;
             BufferedImage dest = new BufferedImage(destWidth, destHeight, BufferedImage.TYPE_INT_RGB);

             dest.setRGB(0, 0, w1, h1, firstPixels, 0, w1); // left or top part
             if (isHorizontal) {
                 dest.setRGB(w1, 0, w2, h2, secondPixels, 0, w2); // right part
             } else {
                 dest.setRGB(0, h1, w2, h2, secondPixels, 0, w2); // bottom part
             }
             return dest;
         }
        

    2.使用selenium

    后来我想着是不是我模拟鼠标这个动作哪里有问题,于是又找到了selenium(2.42.2)。它也能操作htmlunit,关键是它的鼠标动作好像封装得比较完全。

    但是我尝试以后发现,HtmlUnitMouse并没有实现这个动作:

     /**
      * Quoted from Selenium's {@code HtmlUnitMouse}: moving the mouse by an offset is not
      * implemented, so the call always throws.
      *
      * @throws UnsupportedOperationException always
      */
     public void mouseMove(Coordinates where, long xOffset, long yOffset) {
        throw new UnsupportedOperationException("Moving to arbitrary X,Y coordinates not supported.");
      }

    好吧,于是调用chrome吧

    // Drives a real Chrome instance through the login page, drags the captcha slider knob and
    // then copies the browser's cookies so the session can be reused outside the browser.
    System.setProperty("webdriver.chrome.driver", "C:\\chromedriver.exe");
    Proxy proxy = new Proxy();
    // Route the browser's HTTP traffic through the local proxy.
    proxy.setHttpProxy("127.0.0.1:8888");
    // Alternative that was tried: DesiredCapabilities.htmlUnitWithJs() with HtmlUnitDriver.
    DesiredCapabilities capabilities = DesiredCapabilities.chrome();
    capabilities.setCapability(CapabilityType.PROXY, proxy);
    WebDriver driver = new ChromeDriver(capabilities);
    try {
        driver.get("http://www.qixin.com/login");
        driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
        checkPage(driver, "return $('.gt_cut_fullbg_slice');");
        // Print the page title.
        System.out.println("1 Page title is: " + driver.getTitle());
        String pageSource = driver.getPageSource();
        System.out.println(pageSource);

        org.openqa.selenium.JavascriptExecutor executor = (org.openqa.selenium.JavascriptExecutor) driver;
        boolean pageLoaded = "complete".equals(executor.executeScript("return document.readyState"));
        int moveX = 99; // horizontal drag distance (fixed placeholder value)
        if (pageLoaded) {
            WebElement knob = driver.findElement(By.className("gt_slider_knob"));
            // dragAndDropBy takes offsets RELATIVE to the element, so pass only the distance to
            // travel; adding the knob's absolute page coordinates would overshoot the target.
            new Actions(driver).dragAndDropBy(knob, moveX, 0).perform();
            pageSource = driver.getPageSource();
        }

        // Copy the browser cookies, keeping each cookie's own "secure" flag: forcing it to true
        // would stop the cookies from being sent to this plain-http site.
        Set<org.openqa.selenium.Cookie> cookies = driver.manage().getCookies();
        Set<Cookie> cookies2 = new HashSet<>();
        for (org.openqa.selenium.Cookie cookie : cookies) {
            cookies2.add(new Cookie(cookie.getDomain(), cookie.getName(), cookie.getValue(),
                    cookie.getPath(), cookie.getExpiry(), cookie.isSecure()));
        }
        for (Cookie cookie : cookies2) {
            // NOTE(review): the converted cookie is not stored anywhere yet; it still has to be
            // handed to the HTTP client's cookie store.
            org.apache.http.cookie.Cookie httpClient = cookie.toHttpClient();
        }
        System.out.println(pageSource);
    } finally {
        // Always shut the browser and the chromedriver process down.
        driver.quit();
    }

    这样提交的表单确实是有轨迹的,这里移动位置我先写了个固定值,可以由上面图片还原,以及一些开源的图片识别工具识别出位置。以上应该就能解决这个滑动验证码了

  • 相关阅读:
    git命令
    基于babel实现react核心功能(初始化,fiber,hook)
    Vue组件化原理-Xmind版
    访问后台 出现 俩次请求拼接情况 例如 https://localhost:4431/api/auth/jwt/token+https://localhost:4431/api/auth/jwt/token
    spring mvc 拦截器和过滤器
    前后端分离,session登录实例,jquery版本必须大于1.5,否则withCredentials不起作用
    kafka batches 数据结构是自定义map
    数据库blob中文乱码,如何查看
    先更新数据库 后删缓存
    高老师好
  • 原文地址:https://www.cnblogs.com/wangly/p/5630069.html
Copyright © 2011-2022 走看看