通过 PIL 和 Python-tesseract 对验证码进行处理和识别,以模拟登录;测试表明,对像素进行增强能显著提升字符识别的准确率。
#-*- coding:utf-8 -*-
try:
    from PIL import Image, ImageEnhance
except ImportError:
    import Image
import pytesseract
import re
import requests
from lxml import etree
import os

# Background notes
# ----------------
# PIL (Python Imaging Library) is the de facto standard image-processing
# library for Python: very capable, yet with a simple API.
#   a. Python-tesseract is a standalone wrapper around Google's Tesseract-OCR.
#   b. It recognizes the text in an image file and returns the result.
#   c. By default it supports tiff and bmp images; jpeg, gif, png and other
#      formats are only supported once PIL is installed.
#
# The same pipeline applied to a local file, for experimenting:
#   img = Image.open('./validate.png').convert("RGB")
#   imgbri = img.point(lambda i: i * 1.4)   # per-pixel boost; clear OCR gain
#   code = pytesseract.image_to_string(imgbri, lang='eng')
#   code = re.sub(r'\W', '', code)          # drop non-word characters

# Login page URL and the request headers sent with every request.
codeUrl = 'https://so.xxxxxxx.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

# One shared session so cookies set while fetching the captcha are sent
# again with the login request.
requestsSession = requests.Session()

# Multiplier applied to every pixel value before OCR.
_BRIGHTNESS_FACTOR = 1.4


def _cleanCode(rawText):
    """Return *rawText* with every non-word character removed.

    Tesseract output normally carries trailing whitespace/newlines and
    sometimes stray punctuation; only letters, digits and underscores are kept.
    """
    return re.sub(r'\W', '', rawText)


def getCode():
    """Download the login captcha, run OCR on it and return the text.

    Steps: fetch the login page, locate the ``<img id="imgCode">`` element,
    download that image to ``./imgCode.jpg``, brighten it and pass it to
    tesseract.

    Returns:
        str: the recognized captcha with non-word characters stripped
        (may be empty if OCR recognizes nothing).

    Raises:
        IndexError: if the login page contains no ``<img id="imgCode">``.
        Errors from requests, PIL or pytesseract propagate unchanged.
    """
    pageHtml = requestsSession.get(url=codeUrl, headers=headers).text
    srcNodes = etree.HTML(pageHtml).xpath('//img[@id="imgCode"]/@src')
    if not srcNodes:
        raise IndexError('captcha image (img#imgCode) not found in login page')

    imgUrl = "https://so.gushiwen.cn" + str(srcNodes[0])
    imgBytes = requestsSession.get(url=imgUrl, headers=headers).content
    with open('./imgCode.jpg', 'wb') as fp:
        fp.write(imgBytes)

    # The context manager releases the file handle; convert() returns an
    # independent, fully loaded RGB copy that outlives the ``with`` block.
    with Image.open('./imgCode.jpg') as rawImg:
        rgbImg = rawImg.convert("RGB")

    # Boost every pixel value; this noticeably improves recognition.
    brightImg = rgbImg.point(lambda px: px * _BRIGHTNESS_FACTOR)
    ocrText = pytesseract.image_to_string(brightImg, lang='eng')
    return _cleanCode(ocrText)


def login(code):
    """Submit the login form with captcha *code* and save the response.

    The response body is written to ``./loginPage`` (UTF-8) for inspection.

    Args:
        code: captcha text, normally the value returned by ``getCode()``.

    Returns:
        None. Errors from requests or file I/O propagate unchanged.
    """
    loginUrl = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
    # NOTE(review): __VIEWSTATE / __VIEWSTATEGENERATOR are hard-coded here;
    # presumably they were copied from one rendering of the login form —
    # confirm they stay valid, or scrape them from the page instead.
    data = {
        "__VIEWSTATE": "j/z/lKxfNiw1nRO/l6WdCHHn1M89UMpBP9XLld0+alXWaHrgxsN1ji/XVcpLvnKFEKgkURigXyxl3PVieTvJbySKqvpWp9jg4aqvo5Zb8YyeC0v8PW1i92b/pAI=",
        "__VIEWSTATEGENERATOR": "C93BE1AE",
        "from": "http://so.xxxxxx.cn/user/collect.aspx",
        "email": "xxxxxxx",
        "pwd": "xxxx",
        "code": code,
        "denglu": "登录",
    }
    # Send the same headers as the captcha requests so the whole exchange
    # presents one consistent client.
    loginPage = requestsSession.post(url=loginUrl, data=data, headers=headers).text
    with open('./loginPage', 'w', encoding='utf-8') as lp:
        lp.write(loginPage)


if __name__ == "__main__":
    code = getCode()
    login(code)