zoukankan      html  css  js  c++  java
  • 《python3网络爬虫开发实战》--验证码的识别

    1.图形验证码:

    中国知网:http://my.cnki.net/elibRegister/CommonRegister.aspx

     1 import tesserocr
     2 from PIL import Image
     3 
     4 image = Image.open('code2.jpg')
     5 image = image.convert('L')
     6 threshold = 180
     7 table = []
     8 for i in range(256):
     9     if i < threshold:
    10         table.append(0)
    11     else:
    12         table.append(1)
    13 
    14 image = image.point(table, '1')
    15 #image = image.convert('1')
    16 #image.show()
    17 
    18 result = tesserocr.image_to_text(image)
    19 print(result)

    2. 极验滑动验证码的识别

    https://www.geetest.com/Sensebot

    对于应用了极验验证码的网站如果我们直接模拟表单提交,加密参数的构造是个问题,需要分析其加密和校验逻辑,相对烦琐 。 所以我们采用直接模拟浏览器动作的方式来完成验证 。

    可以使用 Selenium 完全模拟人的操作行为来完成验证,这种方式的成本比直接去分析、破解加密算法低很多。

    https://account.geetest.com/login

    (1)模拟点击验证按钮。

    (2)识别滑动缺口的位置。

    (3)模拟拖动滑块 。

      1 import time
      2 from io import BytesIO
      3 from PIL import Image
      4 from selenium import webdriver
      5 from selenium.webdriver import ActionChains
      6 from selenium.webdriver.common.by import By
      7 from selenium.webdriver.support.ui import WebDriverWait
      8 from selenium.webdriver.support import expected_conditions as EC
      9 
# Credentials that CrackGeetest.open() types into the login form.
EMAIL = 'zcs@163.com'
PASSWORD = '123'
# Pixels subtracted from the detected gap offset before the drag track is built.
BORDER = 6
# Unused: get_gap() hard-codes its starting column (60) instead of reading this.
#INIT_LEFT = 60
     14 
     15 
     16 class CrackGeetest():
     17     def __init__(self):
     18         self.url = 'https://account.geetest.com/login'
     19         self.browser = webdriver.Chrome()
     20         self.wait = WebDriverWait(self.browser, 20)
     21         self.email = EMAIL
     22         self.password = PASSWORD
     23 
     24     def __del__(self):
     25         self.browser.close()
     26 
     27     def get_geetest_button(self):
     28         """
     29         获取初始验证按钮
     30         :return:
     31         """
     32         button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))
     33         return button
     34 
     35     def get_position(self):
     36         """
     37         获取验证码位置
     38         :return: 验证码位置元组
     39         """
     40         img = self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img')))
     41         time.sleep(2)
     42         location = img.location
     43         size = img.size
     44         top, bottom, left, right = location['y'], location['y'] + size['height'], location['x'], location['x'] + size[
     45             'width']
     46         return (top, bottom, left, right)
     47 
     48     def get_screenshot(self):
     49         """
     50         获取网页截图
     51         :return: 截图对象
     52         """
     53         screenshot = self.browser.get_screenshot_as_png()
     54         screenshot = Image.open(BytesIO(screenshot))
     55         return screenshot
     56 
     57     def get_slider(self):
     58         """
     59         获取滑块
     60         :return: 滑块对象
     61         """
     62         slider = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button')))
     63         return slider
     64 
     65     def get_geetest_image(self, name='captcha.png'):
     66         """
     67         获取验证码图片
     68         :return: 图片对象
     69         """
     70         top, bottom, left, right = self.get_position()
     71         print('验证码位置', top, bottom, left, right)
     72         screenshot = self.get_screenshot()
     73         # crop将图片裁剪
     74         captcha = screenshot.crop((left, top, right, bottom))
     75         captcha.save(name)
     76         return captcha
     77 
     78     def open(self):
     79         """
     80         打开网页输入用户名密码
     81         :return: None
     82         """
     83         self.browser.get(self.url)
     84         email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
     85         password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
     86         email.send_keys(self.email)
     87         password.send_keys(self.password)
     88 
     89     def get_gap(self, image1, image2):
     90         """
     91         获取缺口偏移量
     92         :param image1: 不带缺口图片
     93         :param image2: 带缺口图片
     94         :return:
     95         """
     96         left = 60
     97         for i in range(left, image1.size[0]):
     98             for j in range(image1.size[1]):
     99                 if not self.is_pixel_equal(image1, image2, i, j):
    100                     left = i
    101                     return left
    102         return left
    103 
    104     def is_pixel_equal(self, image1, image2, x, y):
    105         """
    106         判断两个像素是否相同
    107         :param image1: 图片1
    108         :param image2: 图片2
    109         :param x: 位置x
    110         :param y: 位置y
    111         :return: 像素是否相同
    112         """
    113         # 取两个图片的像素点
    114         pixel1 = image1.load()[x, y]
    115         pixel2 = image2.load()[x, y]
    116         threshold = 60
    117         if abs(pixel1[0] - pixel2[0]) < threshold and abs(pixel1[1] - pixel2[1]) < threshold and abs(
    118                 pixel1[2] - pixel2[2]) < threshold:
    119             return True
    120         else:
    121             return False
    122 
    123     def get_track(self, distance):
    124         """
    125         根据偏移量获取移动轨迹
    126         :param distance: 偏移量
    127         :return: 移动轨迹
    128         """
    129         # 移动轨迹
    130         track = []
    131         # 当前位移
    132         current = 0
    133         # 减速阈值
    134         mid = distance * 4 / 5
    135         # 计算间隔
    136         t = 0.2
    137         # 初速度
    138         v = 0
    139 
    140         while current < distance:
    141             if current < mid:
    142                 # 加速度为正2
    143                 a = 2
    144             else:
    145                 # 加速度为负3
    146                 a = -3
    147             # 初速度v0
    148             v0 = v
    149             # 当前速度v = v0 + at
    150             v = v0 + a * t
    151             # 移动距离x = v0t + 1/2 * a * t^2
    152             move = v0 * t + 1 / 2 * a * t * t
    153             # 当前位移
    154             current += move
    155             # 加入轨迹
    156             track.append(round(move))
    157         return track
    158 
    159     def move_to_gap(self, slider, track):
    160         """
    161         拖动滑块到缺口处
    162         :param slider: 滑块
    163         :param track: 轨迹
    164         :return:
    165         """
    166         ActionChains(self.browser).click_and_hold(slider).perform()
    167         for x in track:
    168             ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
    169         time.sleep(0.5)
    170         ActionChains(self.browser).release().perform()
    171 
    172     def login(self):
    173         """
    174         登录
    175         :return: None
    176         """
    177         submit = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login-btn')))
    178         submit.click()
    179         time.sleep(10)
    180         print('登录成功')
    181 
    182     def crack(self):
    183         # 输入用户名密码
    184         self.open()
    185         # 点击验证按钮
    186         button = self.get_geetest_button()
    187         button.click()
    188         # 获取验证码图片
    189         image1 = self.get_geetest_image('captcha1.png')
    190         # 点按呼出缺口
    191         slider = self.get_slider()
    192         slider.click()
    193         # 获取带缺口的验证码图片
    194         image2 = self.get_geetest_image('captcha2.png')
    195         # 获取缺口位置
    196         gap = self.get_gap(image1, image2)
    197         print('缺口位置', gap)
    198         # 减去缺口位移
    199         gap -= BORDER
    200         # 获取移动轨迹
    201         track = self.get_track(gap)
    202         print('滑动轨迹', track)
    203         # 拖动滑块
    204         self.move_to_gap(slider, track)
    205 
    206         success = self.wait.until(
    207             EC.text_to_be_present_in_element((By.CLASS_NAME, 'geetest_success_radar_tip_content'), '验证成功'))
    208         print(success)
    209 
    210         # 失败后重试
    211         if not success:
    212             self.crack()
    213         else:
    214             self.login()
    215 
    216 
    217 if __name__ == '__main__':
    218     crack = CrackGeetest()
    219     crack.crack()

    但是,如果网站把验证码图片切分成若干小块并随机打乱后再组合显示,我们截取到的图片就无法直接用于比对,这一方法也就无法使用了。

    3.点触验证码的识别

    点触的网址挂了。

    4. 微博宫格识别

  • 相关阅读:
    2020.4.21 考试T1 HDU 5729
    BZOJ 4198: [Noi2015]荷马史诗
    BZOJ 1052: [HAOI2007]覆盖问题
    BZOJ 1087: [SCOI2005]互不侵犯King
    BZOJ 4466 线性函数
    Linux如何挂载U盘
    集中式日志分析平台
    ELK5.2+kafka+zookeeper+filebeat集群部署
    浅析ES的_source、_all、store、index
    IndexOf、LastIndexOf、Substring的用法
  • 原文地址:https://www.cnblogs.com/chengchengaqin/p/9655270.html
Copyright © 2011-2022 走看看