zoukankan      html  css  js  c++  java
  • Python3爬虫(十四) 验证码处理

     Infi-chu:

    http://www.cnblogs.com/Infi-chu/

    一、图形验证码识别
    1.使用tesserocr

    import tesserocr
    from PIL import Image

    # Use a captcha image stored locally ('test.jpg') as the test input.
    image = Image.open('test.jpg')
    # Run OCR on the opened image object and print whatever text was recognised.
    result = tesserocr.image_to_text(image)
    print(result)

    # Shortcut: recognise the text directly from the image file path,
    # without opening the file through PIL first.
    import tesserocr
    print(tesserocr.file_to_text('test.jpg'))

    2.处理验证码图片
    convert()方法,可将图片转化为灰度图像、二值化图像

    # Convert the image to grayscale (mode 'L') and display it.
    image = image.convert('L')
    image.show()
    # Convert the image to a binarized image (mode '1'); per the original
    # note, the default binarization threshold is 127.
    image = image.convert('1')
    
    # Convert the picture to grayscale first, then binarize it with a
    # hand-picked threshold instead of the default one.
    image = image.convert('L')
    threshold = 80  # gray levels below this map to 0, all others to 1
    # 256-entry lookup table, one entry per possible gray level.
    # (The original loop mixed tabs and spaces and did not parse in Python 3.)
    table = [0 if level < threshold else 1 for level in range(256)]
    image = image.point(table, '1')
    image.show()  # the picture should now look clearer
    result = tesserocr.image_to_text(image)
    print(result)
    

    二、滑动验证码识别
    滑动验证码需要拖动一块拼图,将其填入图片中的缺口
    1.滑动验证码特点:
    防模拟
    防伪造
    防暴力

    2.如何识别:
    采用浏览器模拟验证

    3.初始化:

    EMAIL = 'test@test.com'
    PASSWORD = '123456'
    
    class CrackGeetest():
        """Browser-driven helper for the Geetest slider captcha on the login page."""

        def __init__(self):
            """Start a Chrome session and store the login settings.

            Reads the module-level ``EMAIL`` and ``PASSWORD`` constants.
            """
            self.url = 'https://account.geetest.com/login'
            # Fixed: the original called the non-existent ``webdriver.Chome``.
            self.browser = webdriver.Chrome()
            # Explicit-wait helper bound to the browser, created with a timeout of 20.
            self.wait = WebDriverWait(self.browser, 20)
            self.email = EMAIL
            # Fixed: the original stored this under the misspelt name ``pasword``.
            self.password = PASSWORD
    

    4.模拟点击:

    # 寻找按钮
    def get_geetest_button(self):
        """Wait for the initial verification button and return it.

        Returns:
            The element with class ``geetest_radar_tip`` once the wait reports
            it as clickable.
        """
        # Fixed: the locator helper is ``By``; the original wrote ``BY``.
        button = self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip'))
        )
        return button
    # Click the verification button.
    # NOTE(review): this fragment references `self`, so it is presumably meant
    # to live inside a method of the cracking class - confirm.
    button = self.get_geetest_button()
    button.click()
    

    5.识别缺口:
    首先对比原图和现图,利用selenium选取图片元素,得到位置和size,然后获取截图

    # 
    # 获取位置和size
    def position(self):
        """Return the bounding box of the captcha image element.

        Returns:
            tuple: ``(top, bottom, left, right)`` derived from the element's
            ``location`` and ``size``.
        """
        # Fixed: ``presence_of_element_located`` was misspelt ``persence_...``.
        img = self.wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img'))
        )
        # Pause before reading the geometry (presumably to let the image
        # finish rendering - confirm).
        time.sleep(2)
        location = img.location
        size = img.size
        top = location['y']
        bottom = location['y'] + size['height']
        left = location['x']
        right = location['x'] + size['width']
        return (top, bottom, left, right)

    # get_geetest_image() looks this method up as ``get_position``; expose it
    # under that name as well so the existing ``position`` name keeps working.
    get_position = position
    # 获取网页截图
    def get_geetest_image(self, name='captcha.png'):
        """Crop the captcha area out of a full-page screenshot.

        Args:
            name: Kept for interface compatibility; this method does not use it.

        Returns:
            The cropped captcha image (the original built it but returned
            nothing, so callers could never obtain the picture).
        """
        # Bounding box of the captcha on the page.
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()  # screenshot of the whole page
        # crop() takes its box as (left, upper, right, lower).
        captcha = screenshot.crop((left, top, right, bottom))
        return captcha
    # 获取第二张图片(带有缺口的图片)
    def get_slider(self):
        """Wait for the slider handle and return it.

        Returns:
            The element with class ``geetest_slider_button`` once the wait
            reports it as clickable.
        """
        # Indentation normalised to spaces; the original mixed tabs and
        # spaces, which Python 3 rejects.
        slider = self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button'))
        )
        return slider
    # Clicking the slider makes the gap appear in the picture.
    # NOTE(review): fragment that references `self`; presumably belongs inside
    # a method of the cracking class - confirm.
    slider = self.get_slider()
    slider.click()
    # Then call get_geetest_image() again to capture the second picture; the
    # two captures are referred to below as img1 and img2.
    '''
    对比图像的缺口,需要遍历图片的每一个坐标点,获取两张图片对应像素点的RGB数据,如果差距在一定范围内,则代表两个像素相同,接着继续对比下一个像素点。如果差距在一定范围之外,则说明不是相同的像素点,则该位置就是缺口位置
    '''
    def is_pixel_equal(self, img1, img2, x, y):
        """Tell whether two images have (nearly) the same colour at ``(x, y)``.

        Args:
            img1: First image; must provide ``load()`` returning pixel access
                indexable by ``[x, y]``.
            img2: Second image, same requirements.
            x: Horizontal pixel coordinate.
            y: Vertical pixel coordinate.

        Returns:
            bool: True when each of the first three channels differs by less
            than the threshold (60); False otherwise, which marks a pixel of
            the gap.
        """
        pixel1 = img1.load()[x, y]
        pixel2 = img2.load()[x, y]
        threshold = 60
        # Compare the R, G and B channels; every one of them has to be within
        # the threshold for the pixels to count as equal.
        return (
            abs(pixel1[0] - pixel2[0]) < threshold
            and abs(pixel1[1] - pixel2[1]) < threshold
            and abs(pixel1[2] - pixel2[2]) < threshold
        )
    
    def get_gap(self, img1, img2):
        """Find the x offset at which the two captcha pictures start to differ.

        Args:
            img1: Picture without the gap; ``size`` gives ``(width, height)``.
            img2: Picture with the gap.

        Returns:
            int: The x coordinate of the first differing pixel found when
            scanning column by column from x = 60, or 60 when no pixel differs.
        """
        # Scanning starts at column 60; columns to the left are never compared.
        left = 60
        width, height = img1.size
        for x in range(left, width):
            for y in range(height):
                # Fixed: the original passed ``img1.img2`` (attribute access)
                # instead of the two images as separate arguments.
                if not self.is_pixel_equal(img1, img2, x, y):
                    return x
        return left
    

    6.模拟拖动:

    def get_track(self, distance):
        """Build a list of per-step x offsets that together cover ``distance``.

        The motion accelerates (a = 2) until 4/5 of the distance is covered and
        decelerates (a = -3) afterwards, using a time step of 0.2.

        Fixes over the original: ``distance`` was read but never defined (it is
        now a parameter, and ``self`` was added to match the sibling methods),
        ``t^2`` was a bitwise XOR on a float (TypeError) instead of a power,
        and an unused duplicate of the displacement was dropped.

        Args:
            distance: Total offset to cover.

        Returns:
            list[int]: Rounded displacement of every step; empty when
            ``distance`` is not positive.
        """
        track = []
        current = 0        # distance covered so far
        mid = distance * 4 / 5  # switch-over point from speeding up to slowing down
        t = 0.2            # duration of one step
        v = 0              # current velocity
        while current < distance:
            a = 2 if current < mid else -3
            v0 = v
            v = v0 + a * t
            # Displacement under constant acceleration: v0*t + a*t^2/2.
            move = v0 * t + 0.5 * a * t ** 2
            current += move
            track.append(round(move))
        return track
    
    def move_to_gap(self, slider, tracks):
        """Drag the slider along the given track and release it.

        Args:
            slider: The slider element to press and hold.
            tracks: Iterable of horizontal offsets, applied one step at a time.
        """
        # Indentation normalised to spaces; the original mixed tabs and
        # spaces, which Python 3 rejects.
        ActionChains(self.browser).click_and_hold(slider).perform()
        for x in tracks:
            ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
        # Short pause before letting go of the slider.
        time.sleep(0.3)
        ActionChains(self.browser).release().perform()
    

    三、点触验证码识别
    1.和12306的验证码类似
    2.思路:
    文字识别、图像识别
    3.使用超级鹰平台识别
    修改Python API

    import requests
    from hashlib import md5
    
    class Chaojiying(object):
        """Minimal client for the Chaojiying captcha-recognition HTTP API.

        Fixed: the original inherited from the undefined name ``obj`` and
        mixed tabs with spaces, so the class could not even be defined.
        """

        def __init__(self, username, password, soft_id):
            """Store the credentials and prepare the shared request parameters.

            Args:
                username: Account name, sent as ``user``.
                password: Plain-text password; only its MD5 hex digest is
                    kept and sent (as ``pass2``).
                soft_id: Software ID, sent as ``softid``.
            """
            self.username = username
            self.password = md5(password.encode('utf-8')).hexdigest()
            self.soft_id = soft_id
            # Parameters included in every request.
            self.base_params = {
                'user': self.username,
                'pass2': self.password,
                'softid': self.soft_id,
            }
            self.headers = {
                'Connection': 'Keep-Alive',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
            }

        def post_pic(self, im, codetype):
            """Upload a captcha picture for recognition.

            Args:
                im: Picture content, uploaded as the ``userfile`` form file.
                codetype: Captcha type code, sent as ``codetype``.

            Returns:
                The decoded JSON body of the response.
            """
            params = {
                'codetype': codetype,
            }
            params.update(self.base_params)
            files = {'userfile': ('test.jpg', im)}
            r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
            return r.json()

        def report_error(self, im_id):
            """Report a picture whose recognition result was wrong.

            Args:
                im_id: ID of the picture, sent as ``id``.

            Returns:
                The decoded JSON body of the response.
            """
            params = {'id': im_id}
            params.update(self.base_params)
            r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
            return r.json()
    

    4.初始化:

    EMAIL = 'test@test.com'
    PASSWORD = ''
    CHAOJIYING_USERNAME='test'
    CHAOJIYING_PASSWORD=''
    CHAOJIYING_SOFT_ID=893590    # 软件ID
    CHAOJIYING_KIND=9102    # 验证码类型
    class CrackTouClick():
        """Browser-driven helper for the click-the-words captcha."""

        def __init__(self):
            """Start a Chrome session and create the Chaojiying client.

            Reads the module-level ``EMAIL``, ``PASSWORD`` and
            ``CHAOJIYING_*`` constants.
            """
            self.url = '输入要识别的网站'
            # Fixed: the original called the non-existent ``webdriver.Chome``.
            self.browser = webdriver.Chrome()
            # Explicit-wait helper bound to the browser, created with a timeout of 20.
            self.wait = WebDriverWait(self.browser, 20)
            self.email = EMAIL
            self.password = PASSWORD
            # Fixed: Chaojiying.__init__ accepts (username, password, soft_id);
            # the original also passed CHAOJIYING_KIND, which raises TypeError.
            # The captcha kind is supplied per request to post_pic() instead.
            self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAOJIYING_SOFT_ID)
    

    5.获取验证码:

    def open(self):
        """Load the target page and fill in the e-mail and password fields.

        Fixes over the original: the ``self`` parameter was missing although
        the body uses it, ``presence_of_element_located`` was misspelt, the
        password was typed into the e-mail field, and the password field was
        located but never filled.
        """
        self.browser.get(self.url)
        email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
        email.send_keys(self.email)
        password.send_keys(self.password)
    def get_touclick_button(self):
        """Wait for the captcha trigger button and return it.

        Returns:
            The element with class ``touclick-hod-wrap`` once the wait reports
            it as clickable.
        """
        # Indentation normalised to spaces; the original mixed tabs and
        # spaces, which Python 3 rejects.
        button = self.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'touclick-hod-wrap'))
        )
        return button
    def get_touclick_element(self):
        """Wait for the captcha picture container and return it.

        Returns:
            The element with class ``touclick-pub-content`` once it is present.
        """
        # Fixed: ``presence_of_element_located`` was misspelt ``persence_...``.
        element = self.wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'touclick-pub-content'))
        )
        return element
    def get_position(self):
        """Return the captcha element's bounding box as (top, bottom, left, right)."""
        captcha = self.get_touclick_element()
        # Wait one second before reading the element's geometry.
        time.sleep(1)
        origin = captcha.location
        extent = captcha.size
        y_start, x_start = origin['y'], origin['x']
        y_end = y_start + extent['height']
        x_end = x_start + extent['width']
        return (y_start, y_end, x_start, x_end)
    def get_screenshot(self):
        """Take a PNG screenshot through the browser and return it opened as an image."""
        png_bytes = self.browser.get_screenshot_as_png()
        # Wrap the raw bytes in an in-memory file so Image.open can read them.
        return Image.open(BytesIO(png_bytes))
    def get_touclick_image(self, name='captcha.png'):
        """Crop the captcha area out of a full-page screenshot.

        Fixed: the original ``def`` line lacked its trailing colon and the
        body mixed tabs with spaces, so the method did not parse.

        Args:
            name: Kept for interface compatibility; this method does not use it.

        Returns:
            The cropped captcha image.
        """
        top, bottom, left, right = self.get_position()
        print('验证码位置', top, bottom, left, right)
        screenshot = self.get_screenshot()
        # crop() takes its box as (left, upper, right, lower).
        captcha = screenshot.crop((left, top, right, bottom))
        return captcha
    

    6.识别验证码:

    # Grab the captcha picture and serialise it to PNG bytes in memory.
    # NOTE(review): fragment that references `self`; presumably belongs inside
    # a method of CrackTouClick - confirm.
    image = self.get_touclick_image()
    bytes_array = BytesIO()
    image.save(bytes_array, format='PNG')
    # Fixed: the original wrote ``bytes_array,getvalue()`` (comma instead of
    # dot), which passes the buffer object and calls an undefined name.
    res = self.chaojiying.post_pic(bytes_array.getvalue(), CHAOJIYING_KIND)
    print(res)
    def get_points(self, captcha_result):
        """Parse the recognition result into a list of click coordinates.

        Args:
            captcha_result: Mapping whose ``pic_str`` entry holds coordinate
                groups separated by ``|``, each group being comma-separated
                integers (e.g. ``'12,34|56,78'``).

        Returns:
            list[list[int]]: One list of integers per group; empty when
            ``pic_str`` is missing or empty (the original raised in that case).
        """
        pic_str = captcha_result.get('pic_str') or ''
        return [
            [int(number) for number in group.split(',')]
            for group in pic_str.split('|')
            if group  # skip empty groups instead of failing on int('')
        ]
    def touch_click_words(self, locations):
        """Click each recognised position inside the captcha picture.

        Args:
            locations: Iterable of ``[x, y]`` offsets, passed to
                ``move_to_element_with_offset`` relative to the captcha element.
        """
        # Indentation normalised to spaces; the original mixed tabs and
        # spaces, which Python 3 rejects.
        for location in locations:
            print(location)
            ActionChains(self.browser).move_to_element_with_offset(
                self.get_touclick_element(), location[0], location[1]
            ).click().perform()
            # Wait one second between consecutive clicks.
            time.sleep(1)
    
  • 相关阅读:
    进入全屏 nodejs+express+mysql实现restful风格的增删改查示例
    WebAPI 实现前后端分离
    android 集成支付宝app支付(原生态)-包括android前端与java后台
    Windows 64 位系统下 Python 环境的搭建
    Es6主要特征详解
    js上传图片
    Python socket
    设置windows开机自启某个软件
    oracle导入导出数据
    mysql触发器,答题记录表同步教学跟踪(用户列表)
  • 原文地址:https://www.cnblogs.com/Infi-chu/p/8991810.html
Copyright © 2011-2022 走看看