zoukankan      html  css  js  c++  java
  • 几个有用的脚本备记

    tesseract sh训练脚本

    #!/bin/bash
    #
    # Train a new Tesseract eng.traineddata from the stock Arial box/tiff pair.
    #
    # Creates ./tessenv under the current directory, downloads and unpacks the
    # stock English box/tiff archive, runs the training tools on the Arial sample
    # and combines the results into tessenv/build/eng/eng.traineddata.
    #
    # Requires on PATH: GET, tar, tesseract, unicharset_extractor,
    # shapeclustering, mftraining, cntraining, combine_tessdata.

    # Abort on the first failing command, unset variable or failed pipeline stage,
    # so later steps never run in the wrong directory or on missing inputs.
    set -euo pipefail

    # Build the environment (-p keeps re-runs from failing on existing dirs).
    mkdir -p tessenv
    cd tessenv
    TROOT=$(pwd)
    mkdir -p "$TROOT/stockfonts" "$TROOT/build/eng"
    echo "Environment built"

    # Get the stock english fonts from Google (old, but they work)
    cd "$TROOT/stockfonts"
    GET http://tesseract-ocr.googlecode.com/files/boxtiff-2.01.eng.tar.gz > boxtiff-2.01.eng.tar.gz
    echo "Google box/tiff tar.gz loaded"

    # Unpack the fonts; a new english (eng) directory is created with tif/box files.
    tar -xzf boxtiff-2.01.eng.tar.gz
    echo "box/tiff file unpacked"

    # Move the arial font data into the build space (yes, the exp0 is required)
    mv -- "$TROOT/stockfonts/eng/eng.arial.g4.tif" "$TROOT/build/eng.arial.exp0.tif"
    mv -- "$TROOT/stockfonts/eng/eng.arial.box" "$TROOT/build/eng.arial.exp0.box"
    echo "ariel box/tif moved and renamed"

    cd "$TROOT/build"
    # Create the font_properties file
    echo "arial 0 0 0 0 0" > font_properties

    # BEGIN BUILDING NEW eng.traineddata
    tesseract eng.arial.exp0.tif eng.arial.exp0 nobatch box.train
    unicharset_extractor eng.arial.exp0.box
    shapeclustering -F font_properties -U unicharset eng.arial.exp0.tr
    mftraining -F font_properties -U unicharset -O eng.unicharset eng.arial.exp0.tr
    cntraining eng.arial.exp0.tr
    echo "eng.traineddata complete"

    # BEGIN combining into an eng.traineddata set
    # Note files are copied into an isolated directory for combining
    # Note files have language prefix added
    cp -- eng.unicharset "$TROOT/build/eng/eng.unicharset"
    cp -- normproto "$TROOT/build/eng/eng.normproto"
    cp -- inttemp "$TROOT/build/eng/eng.inttemp"
    cp -- pffmtable "$TROOT/build/eng/eng.pffmtable"
    cp -- shapetable "$TROOT/build/eng/eng.shapetable"

    cd "$TROOT/build/eng"
    combine_tessdata eng.

    # You now have an eng.traineddata file in your $TROOT/build/eng directory
    # You must move this file to your /usr/local/share/tessdata directory.
    # You will need sudo permission.
    # BE SURE to back up your old eng.traineddata FIRST
    # Recommend testing your new tesseract with the eng.arial.exp0.tif file in
    # the build directory.

    opencv 文本图片预处理

    # -*- coding: UTF-8 -*-
    import cv2
    def digitsimg(src):
        """Preprocess an image of digits for recognition.

        Pipeline: grayscale -> Otsu binarisation -> resize to 128x128 ->
        erode -> dilate -> histogram equalisation -> 5x5 median blur.

        Args:
            src: image handed to ``cv2.cvtColor`` with ``COLOR_BGR2GRAY``.

        Returns:
            The value returned by the final ``cv2.medianBlur`` call.
        """
        # Convert to grayscale.
        img_gray = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)
        # Otsu thresholding; the computed threshold value itself is not used.
        _, result = cv2.threshold(
            img_gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 2))
        # NOTE(review): the original also eroded the image at this point but
        # overwrote that result before using it, so the call was dead work and
        # has been dropped. Confirm whether a pre-resize erosion was intended.

        # Enlarge the binarised image to make recognition easier.
        result = cv2.resize(result, (128, 128), interpolation=cv2.INTER_CUBIC)
        # Erode to remove small specks left after enlarging.
        eroded = cv2.erode(result, kernel)
        # Dilate to make the digits fuller again.
        result = cv2.dilate(eroded, kernel)
        # Histogram equalisation. The result must be assigned: calling
        # equalizeHist without using its return value leaves `result` unchanged.
        result = cv2.equalizeHist(result)
        # Median filter to remove remaining noise.
        result = cv2.medianBlur(result, 5)
        return result
    # Disabled code: the triple-quoted string below holds an unused
    # `chineseimg(src)` variant and is never executed. As written, it returns
    # right after the cv2.equalizeHist call (whose return value is discarded), so
    # the later resize/erode/dilate/blur statements would be unreachable if the
    # function were re-enabled unchanged.
    '''
    def chineseimg(src):
        
        #灰度化
        img_gray = cv2.cvtColor(src,cv2.COLOR_BGR2GRAY)
        #Otsu thresholding 二值化
        ret,result= cv2.threshold(img_gray,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
      #  cv2.imshow('otsu',result)
      #  cv2.waitKey(0)
        #直方图均衡化使图像更清晰
        cv2.equalizeHist(result)
      #  cv2.imshow('直方图',result)
     #   cv2.waitKey(0)
        return result
        #将结果放大便于识别
        result = cv2.resize(result,(256,128),interpolation=cv2.INTER_CUBIC)
        #腐蚀去除放大后的一些小的点
        kernel = cv2.getStructuringElement(cv2.MORPH_CROSS,(3,2))
        eroded = cv2.erode(result,kernel)
        cv2.imshow('eroded',eroded)
        cv2.waitKey(0)
        #膨胀使数字更饱满
        result = cv2.dilate(eroded,kernel)
        cv2.imshow('dilated',result)
        cv2.waitKey(0)
        #直方图均衡化使图像更清晰
        cv2.equalizeHist(result)
        #中值滤波去除噪点
        result = cv2.medianBlur(result,5)
        cv2.imshow('median',result)
        cv2.waitKey(0)'''
        

    https://coding.net/u/mengning/p/np2016/git/blob/master/BloodTestReportOCR/imgproc.py

  • 相关阅读:
    .Net4.0并行库介绍——线程专有存储
    解决WPF中TextBox文件拖放问题
    POJ 2063 Investment(完全背包)
    HDU 1698 Just a Hook(线段树,成段更新)
    HDU 2665 Kth number(划分树入门题,纯套模板)
    HDU 4251 The Famous ICPC Team Again(划分树入门题)
    POJ 3295 Tautology(构造法)
    HDU 1540 Tunnel Warfare(线段树,去最大连续区间)
    HDU 1394 Minimum Inversion Number(求逆序数,线段树或者树状数组)
    POJ 2993 Emag eht htiw Em Pleh(水模拟)
  • 原文地址:https://www.cnblogs.com/jkmiao/p/6808585.html
Copyright © 2011-2022 走看看