zoukankan      html  css  js  c++  java
  • Webdriver配合Tesseract-OCR 自动识别简单的验证码


    验证码:在进行自动化测试时,经常会遇到验证码的问题,一般有两种处理方式:

    1.找开发去掉验证码或者使用万能验证码

    2.使用OCR自动识别


    使用OCR自动识别时,识别率一般不是太高,但处理简单的验证码还是没有问题的。

    这里使用的是Tesseract-OCR,下载地址:https://github.com/A9T9/Free-Ocr-Windows-Desktop/releases

    怎么使用呢?

    进入安装后的目录,执行如下命令(识别 test.png,并把结果输出到 test.txt):

    tesseract.exe test.png test -1 


    准备一份网页,上面使用该验证码

    <!-- Minimal test page: shows a single captcha image for the OCR demo. -->
    <html>
    <head>
    <title>Table test by Young</title>
    </head>
    <body>
     <br>
    <h1> Test </h1>
     <img src="http://csujwc.its.csu.edu.cn/sys/ValidateCode.aspx?t=1" alt="captcha">
     <br>
    </body>
    </html>

    要识别验证码,首先得取得验证码。这里采取对页面元素局部截图的方式:首先获取整个页面的截图,

    然后找到页面元素坐标进行截取

    /**
     * Takes a screenshot through the driver, crops it to the rectangle
     * reported by {@code element} (location + size) and saves the crop as a
     * PNG file at {@code path}.
     *
     * <p>The crop rectangle is clamped to the screenshot bounds so that an
     * element lying partly outside the captured image does not make
     * {@code getSubimage} throw. I/O failures are reported on stderr and are
     * not rethrown, so the caller continues either way.
     *
     * <p>NOTE(review): this assumes element coordinates map 1:1 onto
     * screenshot pixels (no scroll offset, no display scaling) - confirm for
     * the browser in use.
     *
     * @param driver  driver used to take the screenshot; must implement
     *                {@code TakesScreenshot}
     * @param element element whose area is cut out of the screenshot
     * @param path    destination file for the cropped PNG
     * @throws InterruptedException kept in the signature for compatibility
     *                              with existing callers
     */
    public static void screenShotForElement(WebDriver driver,
            WebElement element, String path) throws InterruptedException {
        File scrFile = ((TakesScreenshot) driver)
                .getScreenshotAs(OutputType.FILE);
        try {
            BufferedImage img = ImageIO.read(scrFile);
            if (img == null) {
                // ImageIO.read returns null when no reader can decode the file.
                throw new IOException("Screenshot could not be decoded: " + scrFile);
            }

            // Clamp the element rectangle to the image so getSubimage stays in range.
            Point p = element.getLocation();
            int x = Math.max(0, Math.min(p.getX(), img.getWidth() - 1));
            int y = Math.max(0, Math.min(p.getY(), img.getHeight() - 1));
            int width = Math.min(element.getSize().getWidth(), img.getWidth() - x);
            int height = Math.min(element.getSize().getHeight(), img.getHeight() - y);
            if (width <= 0 || height <= 0) {
                throw new IOException("Element has no visible area inside the screenshot");
            }
            BufferedImage dest = img.getSubimage(x, y, width, height);

            // Write the crop straight to the destination, creating missing
            // parent directories as the former file copy did.
            File target = new File(path);
            File parent = target.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            if (!ImageIO.write(dest, "png", target)) {
                throw new IOException("No PNG writer available for " + target);
            }
        } catch (IOException e) {
            System.err.println("Failed to save element screenshot to " + path);
            e.printStackTrace();
        }
    }

    截取完元素,就可以调用Tesseract-OCR生成text

    // Use Tesseract to extract the text from the captured image.
    // The arguments are passed as a list so no whitespace splitting happens,
    // and backslashes are escaped so the Windows paths are valid Java literals.
    // NOTE(review): "-1" is passed through unchanged; confirm it is not a typo
    // for Tesseract's "-l <lang>" option.
    Process tesseract = new ProcessBuilder("cmd.exe", "/C", "tesseract.exe",
            "D:\\Tesseract-OCR\\test.png", "D:\\Tesseract-OCR\\test", "-1")
            .inheritIO() // forward Tesseract's output instead of leaving pipes unread
            .start();
    // Wait for the OCR run to finish before anything reads its output file.
    int exitCode = tesseract.waitFor();
    if (exitCode != 0) {
        System.err.println("tesseract exited with code " + exitCode);
    }

    接下来通过java读取txt

    /**
     * Prints the content of a GBK-encoded text file to standard output, one
     * line at a time.
     *
     * <p>If {@code filePath} does not denote an existing regular file, a
     * "file not found" message is printed instead. Any failure while opening
     * or reading the file is reported on standard output together with a
     * stack trace; nothing is rethrown.
     *
     * @param filePath path of the text file to print
     */
    public static void readTextFile(String filePath) {
        try {
            String encoding = "GBK";
            File file = new File(filePath);
            // isFile() is already false for a path that does not exist.
            if (!file.isFile()) {
                System.out.println("找不到指定的文件");
                return;
            }
            // try-with-resources closes the reader (and the underlying stream)
            // even when reading fails part-way through.
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), encoding))) {
                String lineTxt;
                while ((lineTxt = bufferedReader.readLine()) != null) {
                    System.out.println(lineTxt);
                }
            }
        } catch (Exception e) {
            // Deliberately broad: this helper is best-effort and never throws.
            System.out.println("读取文件内容出错");
            e.printStackTrace();
        }
    }

    整体代码如下:

      package com.dbyl.tests;

      import java.awt.Rectangle;
      import java.awt.image.BufferedImage;
      import java.io.BufferedReader;
      import java.io.File;
      import java.io.FileInputStream;
      import java.io.IOException;
      import java.io.InputStreamReader;
      import java.io.Reader;
      import java.util.concurrent.TimeUnit;

      import javax.imageio.ImageIO;

      import org.apache.commons.io.FileUtils;
      import org.openqa.selenium.By;
      import org.openqa.selenium.OutputType;
      import org.openqa.selenium.Point;
      import org.openqa.selenium.TakesScreenshot;
      import org.openqa.selenium.WebDriver;
      import org.openqa.selenium.WebElement;

      import com.dbyl.libarary.utils.DriverFactory;

      /**
       * Demo: opens a local page that shows a captcha image, crops the image
       * out of a browser screenshot, runs Tesseract-OCR on it and prints the
       * recognized text.
       */
      public class TesseractTest {

          /** Where the cropped captcha image is written. */
          private static final String IMAGE_PATH = "D:\\Tesseract-OCR\\test.png";

          /** Output base handed to Tesseract; the text is read back from this name plus ".txt". */
          private static final String OUTPUT_BASE = "D:\\Tesseract-OCR\\test";

          /** Text file read back after the OCR run. */
          private static final String TEXT_PATH = OUTPUT_BASE + ".txt";

          /**
           * Runs the whole flow: screenshot, OCR, print result.
           *
           * @param args unused
           * @throws IOException          if the Tesseract process cannot be started
           * @throws InterruptedException if interrupted while waiting for Tesseract
           */
          public static void main(String[] args) throws IOException,
                  InterruptedException {

              WebDriver driver = DriverFactory.getChromeDriver();
              try {
                  // Configure the timeout before navigating so it applies to this load.
                  driver.manage().timeouts().pageLoadTimeout(30, TimeUnit.SECONDS);
                  driver.get("file:///C:/Users/validation.html");
                  WebElement element = driver.findElement(By.xpath("//img"));

                  // take screen shot for element
                  screenShotForElement(driver, element, IMAGE_PATH);
              } finally {
                  // Always release the browser, even when a step above fails.
                  driver.quit();
              }

              // Use Tesseract to get strings. Arguments are passed as a list so no
              // whitespace splitting happens.
              // NOTE(review): "-1" is passed through unchanged; confirm it is not a
              // typo for Tesseract's "-l <lang>" option.
              Process tesseract = new ProcessBuilder("cmd.exe", "/C", "tesseract.exe",
                      IMAGE_PATH, OUTPUT_BASE, "-1")
                      .inheritIO() // forward Tesseract's output instead of leaving pipes unread
                      .start();
              // Wait for the OCR run to finish instead of sleeping a fixed second.
              int exitCode = tesseract.waitFor();
              if (exitCode != 0) {
                  System.err.println("tesseract exited with code " + exitCode);
              }

              // Read text
              readTextFile(TEXT_PATH);
          }

          /**
           * Prints the content of a GBK-encoded text file to standard output,
           * one line at a time.
           *
           * <p>If {@code filePath} does not denote an existing regular file, a
           * "file not found" message is printed instead. Any failure while
           * opening or reading the file is reported on standard output together
           * with a stack trace; nothing is rethrown.
           *
           * @param filePath path of the text file to print
           */
          public static void readTextFile(String filePath) {
              try {
                  String encoding = "GBK";
                  File file = new File(filePath);
                  // isFile() is already false for a path that does not exist.
                  if (!file.isFile()) {
                      System.out.println("找不到指定的文件");
                      return;
                  }
                  // try-with-resources closes the reader (and the underlying
                  // stream) even when reading fails part-way through.
                  try (BufferedReader bufferedReader = new BufferedReader(
                          new InputStreamReader(new FileInputStream(file), encoding))) {
                      String lineTxt;
                      while ((lineTxt = bufferedReader.readLine()) != null) {
                          System.out.println(lineTxt);
                      }
                  }
              } catch (Exception e) {
                  // Deliberately broad: this helper is best-effort and never throws.
                  System.out.println("读取文件内容出错");
                  e.printStackTrace();
              }
          }

          /**
           * Takes a screenshot through the driver, crops it to the rectangle
           * reported by {@code element} (location + size) and saves the crop as
           * a PNG file at {@code path}.
           *
           * <p>The crop rectangle is clamped to the screenshot bounds so that an
           * element lying partly outside the captured image does not make
           * {@code getSubimage} throw. I/O failures are reported on stderr and
           * are not rethrown.
           *
           * <p>NOTE(review): this assumes element coordinates map 1:1 onto
           * screenshot pixels (no scroll offset, no display scaling) - confirm
           * for the browser in use.
           *
           * @param driver  driver used to take the screenshot; must implement
           *                {@code TakesScreenshot}
           * @param element element whose area is cut out of the screenshot
           * @param path    destination file for the cropped PNG
           * @throws InterruptedException kept in the signature for compatibility
           *                              with existing callers
           */
          public static void screenShotForElement(WebDriver driver,
                  WebElement element, String path) throws InterruptedException {
              File scrFile = ((TakesScreenshot) driver)
                      .getScreenshotAs(OutputType.FILE);
              try {
                  BufferedImage img = ImageIO.read(scrFile);
                  if (img == null) {
                      // ImageIO.read returns null when no reader can decode the file.
                      throw new IOException("Screenshot could not be decoded: " + scrFile);
                  }

                  // Clamp the element rectangle to the image so getSubimage stays in range.
                  Point p = element.getLocation();
                  int x = Math.max(0, Math.min(p.getX(), img.getWidth() - 1));
                  int y = Math.max(0, Math.min(p.getY(), img.getHeight() - 1));
                  int width = Math.min(element.getSize().getWidth(), img.getWidth() - x);
                  int height = Math.min(element.getSize().getHeight(), img.getHeight() - y);
                  if (width <= 0 || height <= 0) {
                      throw new IOException("Element has no visible area inside the screenshot");
                  }
                  BufferedImage dest = img.getSubimage(x, y, width, height);

                  // Write the crop straight to the destination, creating missing
                  // parent directories as the former file copy did.
                  File target = new File(path);
                  File parent = target.getParentFile();
                  if (parent != null) {
                      parent.mkdirs();
                  }
                  if (!ImageIO.write(dest, "png", target)) {
                      throw new IOException("No PNG writer available for " + target);
                  }
              } catch (IOException e) {
                  System.err.println("Failed to save element screenshot to " + path);
                  e.printStackTrace();
              }
          }

      }
    View Code
  • 相关阅读:
    python datetime,字符串,时间戳相互转换
    python在linux环境读取access数据库mdb文件
    ruby 随机字符串rand方法避坑
    gin 页面重定向
    go语言 goquery爬虫
    Rails项目防止时序攻击
    Authorization With Pundit
    Rails/ActiveRecord order by Array
    java线程池
    Java安全API
  • 原文地址:https://www.cnblogs.com/tobecrazy/p/4691045.html
Copyright © 2011-2022 走看看