安装
tesseract.exe 下载:https://digi.bib.uni-mannheim.de/tesseract/
chi_sim.traineddata:https://github.com/tesseract-ocr/tessdata/find/master/chi_sim.traineddata
jTessBoxEditor2.0工具,用于调整图片上文字的内容和位置:https://www.jianshu.com/p/c8ba23ec672a
pip install pillow pytesseract
测试代码
#!/usr/bin/env python
# coding:utf-8
"""Batch-OCR every image under a folder with Tesseract and print the accuracy.

Each image's file name (without its extension) is treated as the expected
text, so a captcha reading "ab12" should be saved as e.g. ``ab12.png``.
"""
import requests
# from lxml import etree
import time
import os
import re
import pytesseract
from PIL import Image, ImageEnhance


def binarizing(img, threshold):
    """Binarize an image in place.

    Args:
        img: Image object exposing ``load()`` and ``size``. Pixel values are
            compared directly with ``threshold``, so a single-channel
            (grayscale) image is expected.
        threshold: Pixels strictly below this value become 0; all others
            become 255.

    Returns:
        The same ``img`` object, modified in place.
    """
    pixdata = img.load()
    w, h = img.size
    for y in range(h):
        for x in range(w):
            pixdata[x, y] = 0 if pixdata[x, y] < threshold else 255
    return img


def Compute_Correct_Rate(Correct, Recognition):
    """Print every mismatching pair and the overall recognition accuracy.

    The comparison is case-insensitive. The rate is the number of matching
    pairs divided by ``len(Correct)``.

    Args:
        Correct: Sequence of expected strings.
        Recognition: Sequence of recognized strings, paired by position with
            ``Correct``.
    """
    if not Correct:
        # Nothing was scored; report 0.0 instead of dividing by zero.
        print('the Correct Rate is ', 0.0)
        return

    Correct_num = 0
    for expected, recognized in zip(Correct, Recognition):
        if expected.lower() == recognized.lower():
            Correct_num += 1
        else:
            print('{} is Wrong,result is {}'.format(recognized, expected))
    print('the Correct Rate is ', Correct_num / len(Correct))


def process_img(rootPath):
    """Run OCR on every file found under ``rootPath`` and print the accuracy.

    Each file is opened as an image, desaturated, brightened, given more
    contrast and sharpened, then passed to Tesseract with the English
    language model. The word characters of the OCR output are concatenated
    and compared against the file name without its extension.

    Args:
        rootPath: Directory that is walked recursively for image files.
    """
    Correct_result = []
    Recognition_result = []
    for root, _dirs, files in os.walk(rootPath):  # visit the whole tree
        for filename in files:
            # Join with the directory currently being walked (not rootPath),
            # otherwise files inside subfolders resolve to a wrong path.
            filepath = os.path.join(root, filename)

            # convert() loads the pixel data, so the file handle can be
            # released as soon as the RGB copy exists.
            # Alternative preprocessing: convert('L') followed by
            # binarizing(gray_image, 130).
            with Image.open(filepath) as source:
                img = source.convert('RGB')  # mode 'L' is worth trying too

            img = ImageEnhance.Color(img).enhance(0)       # drop all color
            img = ImageEnhance.Brightness(img).enhance(2)
            img = ImageEnhance.Contrast(img).enhance(8)
            img = ImageEnhance.Sharpness(img).enhance(20)

            text = pytesseract.image_to_string(img, lang='eng')
            # Keep only word characters; joining an empty match list already
            # yields '' so no separate empty-case branch is needed.
            r_result = ''.join(re.findall(r'\w+', text))
            c_result = os.path.splitext(filename)[0]
            Correct_result.append(c_result)
            Recognition_result.append(r_result)

    Compute_Correct_Rate(Correct_result, Recognition_result)


if __name__ == '__main__':
    # NOTE(review): the path separators of this sample path appear to have
    # been lost when the snippet was published; replace it with the real
    # image directory before running.
    process_img(r"D:dxxsoftwareprojectDevToolsMyTestImagePath")
遇到问题
1.没有安装tesseract.exe
第一次在 PyCharm 中使用 tesseract 时,需要修改 pytesseract.py 中 tesseract_cmd 指向的路径:
在 c:\py368\Lib\site-packages\pytesseract\pytesseract.py 文件中修改
tesseract_cmd = 'G:/soft/Tesseract-OCR验证码识别/tesseract'

tesseract 需要额外安装
链接:https://pan.baidu.com/s/12kfe9bRCj8vFd5f9h5iPCw
提取码:dutl
复制这段内容后打开百度网盘手机App,操作更方便哦

PyCharm 执行程序时报 PermissionError: [WinError 5] 拒绝访问:
tesseract_cmd 需要精确到 exe 文件

tesseract_cmd = 'G:/soft/Tesseract-OCR验证码识别/tesseract.exe'
增加环境变量 TESSDATA_PREFIX,值为 G:/soft/Tesseract-OCR验证码识别/tessdata
系统 Path 变量增加 G:/soft/Tesseract-OCR验证码识别/
打开命令终端,输入:tesseract -v,可以看到版本信息
用命令 tesseract --list-langs 来查看 Tesseract-OCR 支持的语言
参考链接:
https://www.cnblogs.com/jclian91/p/9158372.html
https://www.jianshu.com/p/c8ba23ec672a
https://zhuanlan.zhihu.com/p/113961004
https://www.cnblogs.com/lizhe860/p/8969171.html
https://digi.bib.uni-mannheim.de/tesseract/
https://blog.csdn.net/m0_46498587/article/details/109255325
样例:https://www.pianshen.com/article/84901303763/
https://www.cnblogs.com/pythonywy/archive/2020/02/04/12258681.html