这两天研究了一下OCR图文解析技术。市场上已经有现成的开放服务,比如百度的AI开放平台,就提供了OCR相关的API接口。我这里选用的是开源的Tesseract引擎,其Java封装版本是tess4j。结合网上一些开源项目提供的demo,完成了身份证与营业执照的文字识别处理。总体上来讲Tesseract效果还不错,简单应用上手也很容易(图片质量可以靠前端做好限制,比如身份证识别时加上头像或国徽的框图限定,能提高识别率)。
示例项目地址:https://github.com/git-simm/simm-framework
一、技术介绍
Tesseract:开源的OCR识别引擎,初期Tesseract引擎由HP实验室研发,后来贡献给了开源软件业,后由Google进行改进、修改bug、优化,重新发布。
1、直接识别支持的文件
2、识别图片流
3、识别图片的某块区域
4、将识别结果保存为 TEXT/ HOCR/ PDF/ UNLV/ BOX
5、通过设置取词的等级,提取识别出来的文字
6、获得每一个识别区域的具体坐标范围
7、调整倾斜的图片
8、裁剪图片
9、调整图片分辨率
10、从粘贴板获得图像
11、克隆一个图像(目的:创建一份一模一样的图片,与原图在操作修改上互不影响)
12、图片转换为二进制、黑白图像、灰度图像
13、反转图片颜色
二、环境准备(https://www.jianshu.com/p/ef60ef5395c5)
2.1、我们需要安装tessdata语言包,用于图文识别。 tesseract-ocr语言包的下载地址,用于识别文字时进行匹配。链接: https://pan.baidu.com/s/1XAvPkTdUXuFq-q2InDREhQ 提取码: 6vjp
2.2、项目引入maven依赖
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.5.2</version>
</dependency>
官方文档:Tess4J Usage
2.3 Linux 环境
sudo yum -y install tesseract  ## 安装so和英文tessdata
sudo yum -y install tesseract-langpack-chi_sim.noarch  ## 安装简体中文
sudo yum -y install tesseract-langpack-chi_tra.noarch  ## 安装繁体中文
sudo ls -l /usr/share/tesseract/tessdata/*.traineddata  ## tessdata 目录
Windows 版本的 Tesseract 本机库是用 VS2013(或者VS2012/VS2015) 构建的,所以必须安装 Microsoft Visual C++ 20XX Redistributable。
已经在 WinXP/Win7/Win10 上验证通过:安装 Visual C++ Redistributable for VS2013 的 32 位版本即可。
三、简单描述下图文解析的过程
四、服务端关键代码的展示
private static int targetBrightness = 260; private static int targetDifferenceValue = 15; /** * 解析身份证信息 * * @param inputStream * @return * @throws Exception */ @Override public BizLicenseInfo getInfo(InputStream inputStream) throws Exception { BizLicenseInfo bizLicenseInfo = new BizLicenseInfo(); String rootPath = ClassUtils.getDefaultClassLoader().getResource("").getPath()+"/tmp"; Tesseract tesseract = new Tesseract(); tesseract.setLanguage("chi_sim"); //读取网络图片 BufferedImage bufferedImage = ImageFilter.cloneImage(ImageIO.read(inputStream)); //不过滤部分颜色 //bufferedImage = ImageFilter.imageRGBDifferenceFilter(bufferedImage, targetDifferenceValue, null); bufferedImage = ImageFilter.convertImageToGrayScale(bufferedImage); //缩放到真实身份证大小 bufferedImage = ImageFilter.imageScale(bufferedImage, 3150, 1920); try (OutputStream outputStream = new FileOutputStream(rootPath+"/bg.jpg")) { saveImg(bufferedImage,outputStream); getBufferedNameImage(tesseract, bufferedImage, bizLicenseInfo,rootPath+"/nameImageBefore.jpg"); getBufferedCapitalImage(tesseract, bufferedImage, bizLicenseInfo,rootPath+"/capitalImageBefore.jpg"); getBufferedBizTypeImage(tesseract, bufferedImage, bizLicenseInfo,rootPath+"/bizTypeImageBefore.jpg"); getBufferedBuildOnImage(tesseract, bufferedImage, bizLicenseInfo,rootPath+"/buildOnImageBefore.jpg"); getBufferedJuridicalImage(tesseract, bufferedImage, bizLicenseInfo,rootPath+"/juridicalImageBefore.jpg"); getBufferedBizLimitImage(tesseract, bufferedImage, bizLicenseInfo,rootPath+"/bizLimitImageBefore.jpg"); getBufferedBizScopeImage(tesseract, bufferedImage, bizLicenseInfo,rootPath+"/bizScopeImageBefore.jpg"); getBufferedAddressImage(tesseract, bufferedImage, bizLicenseInfo,rootPath+"/addressImageBefore.jpg"); getBufferedCreditCodeImage(tesseract, bufferedImage, bizLicenseInfo,rootPath+"/creditCodeImageBefore.jpg"); return bizLicenseInfo; }catch (Exception e){ e.printStackTrace(); throw e; } }
/** * 获取统一社会信用代码 * @param tesseract * @param bufferedImage * @param bizLicenseInfo * @param path * @throws IOException * @throws TesseractException */ private void getBufferedCreditCodeImage(Tesseract tesseract, BufferedImage bufferedImage, BizLicenseInfo bizLicenseInfo, String path) throws IOException, TesseractException { try (OutputStream outputStream = new FileOutputStream(path)) { BufferedImage idImage = ImageFilter.subImage(bufferedImage, bufferedImage.getMinX() + 200 , 250, 550, 300); System.out.println("creditCodeImage 辉度处理"); handBrightness(idImage, targetBrightness); saveImg(idImage, outputStream); // tesseract.setLanguage("eng"); tesseract.setLanguage("chi_sim"); // W 可以配置 非字母和数字,等价于 [^a-zA-Z0-9] (d D 小写表示匹配数字,大写表示匹配非数字) String idCardNumber = tesseract.doOCR(idImage).replaceAll("[\W]", ""); bizLicenseInfo.setCreditCode(idCardNumber); }catch (Exception e){ e.printStackTrace(); throw e; } } /** * 获取名称 * @param tesseract * @param bufferedImage * @param bizLicenseInfo * @param path */ private void getBufferedNameImage(Tesseract tesseract, BufferedImage bufferedImage, BizLicenseInfo bizLicenseInfo, String path) throws IOException, TesseractException { BufferedImage buffered = ImageFilter.subImage(bufferedImage, 520, 700, 1200, 120); getBufferedImage(tesseract,buffered,path,(img,content)->{ System.out.println("setName 辉度处理"); bizLicenseInfo.setName(content); }); } /** * 获取类型 * @param tesseract * @param bufferedImage * @param bizLicenseInfo * @param path * @throws IOException * @throws TesseractException */ private void getBufferedBizTypeImage(Tesseract tesseract, BufferedImage bufferedImage, BizLicenseInfo bizLicenseInfo, String path) throws IOException, TesseractException { BufferedImage buffered = ImageFilter.subImage(bufferedImage, 520, 820, 1200, 130); getBufferedImage(tesseract,buffered,path,(img,content)->{ System.out.println("setBizType 辉度处理"); bizLicenseInfo.setBizType(content); }); } /** * 获取法人信息 * @param tesseract * @param bufferedImage * @param 
bizLicenseInfo * @param path * @throws IOException * @throws TesseractException */ private void getBufferedJuridicalImage(Tesseract tesseract, BufferedImage bufferedImage, BizLicenseInfo bizLicenseInfo, String path) throws IOException, TesseractException { BufferedImage buffered = ImageFilter.subImage(bufferedImage, 520, 950, 1200, 120); getBufferedImage(tesseract,buffered,path,(img,content)->{ System.out.println("setJuridical 辉度处理"); bizLicenseInfo.setJuridical(content); }); } /** * 获取经营范围 * @param tesseract * @param bufferedImage * @param bizLicenseInfo * @param path * @throws IOException * @throws TesseractException */ private void getBufferedBizScopeImage(Tesseract tesseract, BufferedImage bufferedImage, BizLicenseInfo bizLicenseInfo, String path) throws IOException, TesseractException { BufferedImage buffered = ImageFilter.subImage(bufferedImage, 520, 1070, 1330, bufferedImage.getHeight() - 1200); getBufferedImage(tesseract,buffered,path,(img,content)->{ System.out.println("setBizScope 辉度处理"); bizLicenseInfo.setBizScope(content); }); } /** * 获取注册资本 * @param tesseract * @param bufferedImage * @param bizLicenseInfo * @param path * @throws IOException * @throws TesseractException */ private void getBufferedCapitalImage(Tesseract tesseract, BufferedImage bufferedImage, BizLicenseInfo bizLicenseInfo, String path) throws IOException, TesseractException { BufferedImage buffered = ImageFilter.subImage(bufferedImage, 2170, 720, bufferedImage.getWidth()-2400, 120); getBufferedImage(tesseract,buffered,path,(img,content)->{ System.out.println("setCapital 辉度处理"); bizLicenseInfo.setCapital(content); }); } /** * 获取成立日期 * @param tesseract * @param bufferedImage * @param bizLicenseInfo * @param path * @throws IOException * @throws TesseractException */ private void getBufferedBuildOnImage(Tesseract tesseract, BufferedImage bufferedImage, BizLicenseInfo bizLicenseInfo, String path) throws IOException, TesseractException { BufferedImage buffered = 
ImageFilter.subImage(bufferedImage, 2170, 850, bufferedImage.getWidth()-2400, 100); getBufferedImage(tesseract,buffered,path,(img,content)->{ System.out.println("setBuildOn 辉度处理"); bizLicenseInfo.setBuildOn(content); }); } /** * 获取营业期限 * @param tesseract * @param bufferedImage * @param bizLicenseInfo * @param path * @throws IOException * @throws TesseractException */ private void getBufferedBizLimitImage(Tesseract tesseract, BufferedImage bufferedImage, BizLicenseInfo bizLicenseInfo, String path) throws IOException, TesseractException { BufferedImage buffered = ImageFilter.subImage(bufferedImage, 2170, 970, bufferedImage.getWidth()-2400, 100); getBufferedImage(tesseract,buffered,path,(img,content)->{ System.out.println("setBizLimit 辉度处理"); bizLicenseInfo.setBizLimit(content); }); } /** * 获取住所 * @param tesseract * @param bufferedImage * @param bizLicenseInfo * @param path * @throws IOException * @throws TesseractException */ private void getBufferedAddressImage(Tesseract tesseract, BufferedImage bufferedImage, BizLicenseInfo bizLicenseInfo, String path) throws IOException, TesseractException { BufferedImage buffered = ImageFilter.subImage(bufferedImage, 2170, 1070, bufferedImage.getWidth()-2240, 270); getBufferedImage(tesseract,buffered,path,(img,content)->{ System.out.println("setAddress 辉度处理"); bizLicenseInfo.setAddress(content); }); } /** * 获取名称 * @param tesseract * @param buffered * @param path * @param consumer */ private void getBufferedImage(Tesseract tesseract, BufferedImage buffered, String path, BiConsumer<BufferedImage,String> consumer) throws IOException, TesseractException { try (OutputStream outputStream = new FileOutputStream(path)) { // addressImage = ImageFilter.imageScale(addressImage, ((int) (addressImage.getWidth() * 2.4) + 1), ((int) (addressImage.getHeight() * 2.4) + 1)); handBrightness(buffered, targetBrightness); saveImg(buffered, outputStream); tesseract.setLanguage("chi_sim"); String result = tesseract.doOCR(buffered); //留下中文字符、中文标点符号()【】、 
String regexStr = "[^\s\u4e00-\u9fa5\(\)\uff08\uff09\u3001\u3010\u3011\-0-9]+"; String content = result.replaceAll(regexStr, "") .replaceAll("\n", "") .replaceAll(" ", ""); if(consumer!=null){ consumer.accept(buffered,content); } }catch (Exception e){ e.printStackTrace(); throw e; } } /** * 保存图片 * @param image * @param outputStream * @throws IOException */ private void saveImg(BufferedImage image,OutputStream outputStream) throws IOException { ImageIO.write(image, "jpg", outputStream); } /** * 处理图片辉度 * * @param subImage */ private void handBrightness(BufferedImage subImage, int targetBrightness) { int fixedBrightness; int birthBrightness = ImageFilter.imageBrightness(subImage); System.out.println("brightness = " + birthBrightness); fixedBrightness = targetBrightness - birthBrightness; //辉度处理 if (fixedBrightness != 0) { subImage = ImageFilter.imageBrightness(subImage, fixedBrightness); } System.out.println("after brightness = " + ImageFilter.imageBrightness(subImage)); }
五、解析效果展示
5.1、身份证信息识别示例:
5.2、营业执照信息识别示例:
参考资料:
https://github.com/firefoxmmx2/IDCardIDentify