zoukankan      html  css  js  c++  java
  • tesseract-ocr 识别中文扫描图片

    原文链接:http://www.cnblogs.com/alex-blog/articles/2714984.html

     

    项目主页地址:http://code.google.com/p/tesseract-ocr/

    相关资源下载地址:http://code.google.com/p/tesseract-ocr/downloads/list

    需要下载的资源有:

    1、tesseract-ocr-setup-3.01-1.exe

    因我本地为windows系统,所以用这个

    2、chi_sim.traineddata.gz

    中文识别时需要的。

    安装tesseract-ocr

    自定义安装语言包

    在 Tesseract-OCR 安装目录下找到 tessdata 目录,该目录用于存放语言包。将 chi_sim.traineddata.gz 解压后得到的 chi_sim.traineddata 文件复制到该目录下即可。

    本文使用参考blog中的例子

    如下:

    package org.img;
    import java.awt.image.BufferedImage;
    import java.io.File;
    import java.io.IOException;
    import java.util.Iterator;
    import java.util.Locale;
    
    import javax.imageio.IIOImage;
    import javax.imageio.ImageIO;
    import javax.imageio.ImageReader;
    import javax.imageio.ImageWriteParam;
    import javax.imageio.ImageWriter;
    import javax.imageio.metadata.IIOMetadata;
    import javax.imageio.stream.ImageInputStream;
    import javax.imageio.stream.ImageOutputStream;
    
    import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;
    
    /**
     * Converts an image file into an uncompressed TIFF copy using the
     * {@code javax.imageio} API.
     */
    public class ImageIOHelper {
        /**
         * Converts an image file to an uncompressed TIFF file stored next to the source.
         *
         * <p>The TIFF file is named after the source file with {@code 0} appended to the
         * base name and the extension replaced by {@code tif}
         * (for example {@code scan.png} becomes {@code scan0.tif}).
         *
         * @param imageFile   the image file to convert
         * @param imageFormat the ImageIO format name of the source image (e.g. {@code "png"})
         * @return the TIFF file that was written, or {@code null} if an {@link IOException}
         *         prevented the conversion (the stack trace is printed in that case)
         * @throws java.util.NoSuchElementException if no ImageIO reader is registered for
         *         {@code imageFormat} or no writer is registered for {@code "tiff"}
         */
        public static File createImage(File imageFile, String imageFormat) {
            File converted = null;
            File target = null;
            ImageReader reader = null;
            ImageWriter writer = null;
            ImageInputStream iis = null;
            ImageOutputStream ios = null;
            try {
                Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);
                reader = readers.next();

                // createImageInputStream returns null when no stream provider can open the input.
                iis = ImageIO.createImageInputStream(imageFile);
                if (iis == null) {
                    throw new IOException("Unable to open image for reading: " + imageFile);
                }
                reader.setInput(iis);
                // Read the stream metadata
                IIOMetadata streamMetadata = reader.getStreamMetadata();

                // Set up the write parameters: compression disabled
                TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.CHINESE);
                tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);

                // Get the TIFF writer
                Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff");
                writer = writers.next();

                BufferedImage bi = reader.read(0);
                IIOImage image = new IIOImage(bi, null, reader.getImageMetadata(0));

                target = tempImageFile(imageFile);
                ios = ImageIO.createImageOutputStream(target);
                if (ios == null) {
                    throw new IOException("Unable to open image for writing: " + target);
                }
                writer.setOutput(ios);
                writer.write(streamMetadata, image, tiffWriteParam);

                // Only report the file once it has been written completely.
                converted = target;
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Release everything on every path; the streams must be closed
                // before a partially written file can be removed.
                closeQuietly(ios);
                closeQuietly(iis);
                if (writer != null) {
                    writer.dispose();
                }
                if (reader != null) {
                    reader.dispose();
                }
                if (converted == null && target != null) {
                    target.delete();
                }
            }
            return converted;
        }

        /**
         * Closes an ImageIO stream, printing (rather than propagating) any failure so
         * that cleanup of the remaining resources can continue.
         *
         * @param stream the stream to close; may be {@code null}
         */
        private static void closeQuietly(ImageInputStream stream) {
            if (stream == null) {
                return;
            }
            try {
                stream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        /**
         * Derives the TIFF file for a source image: same directory, base name with
         * {@code 0} appended, extension {@code tif}.
         *
         * <p>Only the file name (not the directory part) is searched for the extension
         * dot, and a name without any extension simply gets {@code 0.tif} appended.
         *
         * @param imageFile the source image file
         * @return the file the TIFF copy should be written to
         */
        private static File tempImageFile(File imageFile) {
            String name = imageFile.getName();
            int dot = name.lastIndexOf('.');
            String baseName = (dot >= 0) ? name.substring(0, dot) : name;
            return new File(imageFile.getParentFile(), baseName + "0.tif");
        }

    }
    package org.img;
    import java.io.BufferedReader;
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.InputStreamReader;
    import java.util.ArrayList;
    import java.util.List;
    
    import org.jdesktop.swingx.util.OS;
    
    public class OCR {
        private final String LANG_OPTION = "-l";  //英文字母小写l,并非数字1
        private final String EOL = System.getProperty("line.separator");
        private String tessPath = "C://Program Files//Tesseract-OCR"; //

    注意这个路径,为安装的tesseract-OCR的路径

        //private String tessPath = new File("tesseract").getAbsolutePath();
        
        public String recognizeText(File imageFile,String imageFormat)throws Exception{
            File tempImage = ImageIOHelper.createImage(imageFile,imageFormat);
            File outputFile = new File(imageFile.getParentFile(),"output");
            StringBuffer strB = new StringBuffer();
            List<String> cmd = new ArrayList<String>();
            if(OS.isWindowsXP()){
                cmd.add(tessPath+"//tesseract");
            }else if(OS.isLinux()){
                cmd.add("tesseract");
            }else{
                cmd.add(tessPath+"//tesseract");
            }
            cmd.add("");
            cmd.add(outputFile.getName());
            cmd.add(LANG_OPTION);
            cmd.add("chi_sim");
            //cmd.add("eng");
            
            ProcessBuilder pb = new ProcessBuilder();
            pb.directory(imageFile.getParentFile());
            
            cmd.set(1, tempImage.getName());
            pb.command(cmd);
            pb.redirectErrorStream(true);
            
            Process process = pb.start();
            //tesseract.exe 1.jpg 1 -l chi_sim
            int w = process.waitFor();
            
            //删除临时正在工作文件
            tempImage.delete();
            
            if(w==0){
                BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(outputFile.getAbsolutePath()+".txt"),"UTF-8"));
                
                String str;
                while((str = in.readLine())!=null){
                    strB.append(str).append(EOL);
                }
                in.close();
            }else{
                String msg;
                switch(w){
                    case 1:
                        msg = "Errors accessing files.There may be spaces in your image's filename.";
                        break;
                    case 29:
                        msg = "Cannot recongnize the image or its selected region.";
                        break;
                    case 31:
                        msg = "Unsupported image format.";
                        break;
                    default:
                        msg = "Errors occurred.";
                }
                tempImage.delete();
                throw new RuntimeException(msg);
            }
            new File(outputFile.getAbsolutePath()+".txt").delete();
            return strB.toString();
        }
    }
    package org.img;
    import java.io.File;
    import java.io.IOException;
    
    
    public class TestOCR {
    
        /**
         * @param args
         */
        public static void main(String[] args) {
            String path = "D:\temp\img\untitled8.png";   
            try {   
                String valCode = new OCR().recognizeText(new File(path), "png");  
                //6905_1294109277pAj9.jpg
                System.out.println(valCode);   
            } catch (IOException e) {   
                e.printStackTrace();   
            } catch (Exception e) {
                e.printStackTrace();
            }    
        }
    
    }

    如果运行时报错,请先检查 tessPath 参数是否已设置为 Tesseract-OCR 的实际安装路径。

    本文参考以下两位的blog:

    http://blog.csdn.net/foamflower/article/details/6110211

    http://blog.csdn.net/zhoushuyan/article/details/5948289

  • 相关阅读:
    meta属性
    博客
    概念术语
    装饰器与生成器
    Linux基础
    线程
    网络编程之socket
    网络编程之网络基础部分

    内置函数(max,min,zip)及文件处理
  • 原文地址:https://www.cnblogs.com/beipiaofeng/p/4225160.html
Copyright © 2011-2022 走看看