zoukankan      html  css  js  c++  java
  • Java之Tesseract实现OCR

    1、实现逻辑

    package com.vue.demo.service.serviceimpl;
    
    import com.vue.demo.service.OCRService;
    import net.sourceforge.tess4j.Tesseract;
    import net.sourceforge.tess4j.TesseractException;
    import net.sourceforge.tess4j.util.ImageHelper;
    import org.apache.commons.io.FileUtils;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    import org.springframework.stereotype.Service;
    import org.springframework.web.multipart.MultipartFile;
    
    import javax.imageio.ImageIO;
    import java.awt.image.BufferedImage;
    import java.io.File;
    import java.io.IOException;
    
    /**
     * @author yangwj
     * @date 2020/4/1 9:29
     */
    @Service
    public class OCRServiceImpl implements OCRService {
        private  static  final Logger ocrServiceImplLog = LoggerFactory.getLogger(OCRServiceImpl.class);
        String language = "";
    
        /**
         * 方法一
         * @param file
         * @return
         */
        @Override
        public String getCharacterFromPic(MultipartFile file) {
    //        String modelPath = "D:\software\ocr-tesseract\tessdata";
            String modelPath = "/root/project/java/tesseract_model";
    
            Tesseract tessreact = new Tesseract();
            //需要指定训练集 训练集到 https://github.com/tesseract-ocr/tessdata 下载。
            tessreact.setDatapath(modelPath);
    
            if(language.equals("ch")) {
                //注意  默认是英文识别,如果做中文识别,需要单独设置。
                tessreact.setLanguage("chi_sim");
            }
            try {
                File imageFile = new File(file.getOriginalFilename());
                FileUtils.copyInputStreamToFile(file.getInputStream(), imageFile);
                String result = tessreact.doOCR(imageFile);
                ocrServiceImplLog.info(result);
                System.out.println("----------------");
                String handleResult  =  this.ocr(imageFile,modelPath);
                ocrServiceImplLog.info(handleResult);
                return result+"----------------------------------
    
    "+handleResult;
            } catch (TesseractException e) {
                System.err.println(e.getMessage());
            } catch (IOException e) {
                e.printStackTrace();
            }
            return null;
        }
    
        @Override
        public String getLanguage(String language) {
            if(language == null || language == "" ) {
                return null;
            }
            this.language = language;
            return "success";
        }
    
        /**
         * 方法二
         * @param file
         * @param modelPath
         * @return
         */
        private  String ocr(File file,String modelPath) {
            String result = null;
            try {
                double start = System.currentTimeMillis();
                BufferedImage textImage = ImageIO.read(file);
                // 这里对图片黑白处理,增强识别率.这里先通过截图,截取图片中需要识别的部分
                textImage = ImageHelper.convertImageToGrayscale(textImage);
                // 图片锐化
                textImage = ImageHelper.convertImageToBinary(textImage);
                // 图片放大倍数,增强识别率(很多图片本身无法识别,放大5倍时就可以轻易识,但是考滤到客户电脑配置低,针式打印机打印不连贯的问题,这里就放大5倍)
                textImage = ImageHelper.getScaledInstance(textImage, textImage.getWidth() * 1, textImage.getHeight() * 1);
    
                textImage = ImageHelper.convertImageToBinary(textImage);
                String saveImgPath = "/root/project/java/tesseract_model/temp_img";
    //            String saveImgPath = "D:\software\ocr-tesseract\img_tem\temp.img";
                ImageIO.write(textImage, "png", new File(saveImgPath));
    
                Tesseract instance = new Tesseract();
                //设置训练库的位置
    //            String modelPath = "/root/project/java/tesseract_model";
    
    
                instance.setDatapath(modelPath);
                //中文识别
                instance.setLanguage("chi_sim");
                result = instance.doOCR(textImage);
                double end = System.currentTimeMillis();
                System.out.println("耗时" + (end - start) / 1000 + " s");
            } catch (Exception e) {
                e.printStackTrace();
            }
            return result;
        }
    
    }

    2、部署到 CentOS 时遇到的问题,可以参考这篇文章

  • 相关阅读:
    Live2d Test Env
    关于word2vec的一些问题
    排序链表
    最长回文子串
    前缀树
    验证回文串
    最大子序和/积
    构建知识图谱-初学
    HMM-维特比算法理解与实现(python)
    跨存储后台迁移数据的三种方案
  • 原文地址:https://www.cnblogs.com/ywjfx/p/12757461.html
Copyright © 2011-2022 走看看