zoukankan      html  css  js  c++  java
  • 爬虫之Selenium模块

    1.Selenium模块介绍

      该模块最初是一个自动化测试模块,而在爬虫中使用它是为了解决requests无法直接执行JavaScript代码的问题

      本质是驱动浏览器,完全模拟浏览器的操作,如跳转、输入、点击、下拉等

      selenium支持的多种浏览器:

    from selenium import webdriver
    # Each line constructs a driver for a different browser; every assignment
    # rebinds `browser`, so in real code pick just one of them.
    browser=webdriver.Chrome()
    browser=webdriver.Firefox()
    browser=webdriver.PhantomJS()   # a browser without a UI
    browser=webdriver.Safari()
    browser=webdriver.Edge() 

      它的官方文档链接:https://selenium-python.readthedocs.io/

    1.1 安装

    针对有界面的浏览器:

    安装链接:http://npm.taobao.org/mirrors/chromedriver/

    下载chromedriver.exe,把它放到python安装路径的scripts目录中即可(本人windows)

    对于mac系统,将解压后的chromedriver移动到/usr/local/bin目录下

    然后在我们的python环境中用pip安装selenium包

    pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium

     注意:selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver 下载链接

    对于无界面的浏览器phantomjs,该浏览器已经不再提供更新,不需要再去研究,现在基本无法使用了。

    但是Google 自发布 chrome 59 / 60 正式版 开始便支持Headless mode ,

    这意味着在无 GUI 环境下, PhantomJS 不再是唯一选择 ,我们完全可以通过使用谷歌来达到相同的效果

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    # Chrome expects the window size as "width,height".
    chrome_options.add_argument('--window-size=1920,3000')  # browser resolution
    chrome_options.add_argument('--disable-gpu')  # Google's docs suggest this flag to work around a bug
    chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars, helps with some special pages
    chrome_options.add_argument('blink-settings=imagesEnabled=false')  # do not load images, for speed
    # No visible window; on Linux without a display, startup fails without this flag.
    chrome_options.add_argument('--headless')
    # Explicit location of the browser binary (raw string keeps the backslashes literal).
    chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"


    driver=webdriver.Chrome(chrome_options=chrome_options)
    try:
        driver.get('https://www.baidu.com')

        print('hao123' in driver.page_source)
    finally:
        # quit() closes every window and also ends the chromedriver process,
        # even when the page load above raised.
        driver.quit()

    无界面(headless)模式的使用(没有弹窗,但是代码依旧可以运行)

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    req_url = "https://www.baidu.com"
    # Run Chrome without a visible window.
    chrome_options=Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    # Load the page, then dump its source.
    browser.get(req_url)
    print(browser.page_source)
    # Close the window first, then shut down the chromedriver process.
    browser.close()
    browser.quit()

     

    2.使用

     模拟用浏览器访问百度,并输入onepiece搜索

    指定一个加载后的资源作为搜索结果

    代码

    import time
    from selenium import webdriver
    from selenium.webdriver.common.by import By  # 按照什么方式去查找
    from selenium.webdriver.common.keys import Keys  # 键盘按键操作
    from selenium.webdriver.support import expected_conditions as E
    from selenium.webdriver.support.wait import WebDriverWait
    
    browser = webdriver.Chrome()
    try:
        browser.get('https://www.baidu.com/')
        # The search box has id "kw": type the keyword, then press Enter.
        input_tag = browser.find_element_by_id('kw')
        for keys in ('onepiece', Keys.ENTER):
            input_tag.send_keys(keys)
        # Wait up to 10 seconds for the element with id "container" to appear.
        wait = WebDriverWait(browser, 10)
        results_ready = E.presence_of_element_located((By.ID, 'container'))
        wait.until(results_ready)
        time.sleep(3)
    finally:
        # Close the browser window whether or not the search succeeded.
        browser.close()

    2.1 选择器

    1.基本用法

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # 键盘按键操作
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
    import time
    
    # Demo of the element lookup methods against the Baidu home page.
    driver = webdriver.Chrome()
    driver.get('https://www.baidu.com')
    wait = WebDriverWait(driver, 10)

    try:
        # =============== All lookup methods ===================
        # 1. find_element_by_id
        # 2. find_element_by_link_text
        # 3. find_element_by_partial_link_text
        # 4. find_element_by_tag_name
        # 5. find_element_by_class_name
        # 6. find_element_by_name
        # 7. find_element_by_css_selector
        # 8. find_element_by_xpath
        # Notes:
        # 1. Each of the above can also be written in the find_element(By.ID,'kw') form
        # 2. The find_elements_by_xxx variants look up several elements; the result is a list

        # =============== Usage examples ===================
        # 1. find_element_by_id
        print(driver.find_element_by_id('kw'))      # id of the search box

        # 2. find_element_by_link_text
        # login=driver.find_element_by_link_text('登录')  # search by link text
        # login.click()

        # 3. find_element_by_partial_link_text
        # NOTE(review): the search text is an empty string; it may have been
        # lost when this page was extracted -- confirm the intended text.
        login = driver.find_elements_by_partial_link_text('')[0]
        # login.click()

        # 4. find_element_by_tag_name
        print(driver.find_element_by_tag_name('a'))     # search by tag name

        # 5. find_element_by_class_name
        # (done here through an explicit wait on By.CLASS_NAME)
        button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'tang-pass-footerBarULogin')))
        button.click()

        # 6. find_element_by_name
        # (done here through explicit waits on By.NAME / By.ID)
        input_user = wait.until(EC.presence_of_element_located((By.NAME, 'userName')))
        input_pwd = wait.until(EC.presence_of_element_located((By.NAME, 'password')))
        commit = wait.until(EC.element_to_be_clickable((By.ID, 'TANGRAM__PSP_10__submit')))

        input_user.send_keys('用户名')
        input_pwd.send_keys('密码')
        commit.click()  # submit

        # 7. find_element_by_css_selector
        driver.find_element_by_css_selector('#kw')

        # 8. find_element_by_xpath

        time.sleep(5)

    finally:
        # Close the browser window even if a lookup above failed.
        driver.close()

    2.2 标签属性获取

    from selenium import webdriver
    from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
    
    browser = webdriver.Chrome()

    browser.get('https://www.amazon.cn/')

    # Wait up to 10 seconds for the banner container to be present.
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'cc-lm-tcgShowImgContainer')))

    tag = browser.find_element(By.CSS_SELECTOR, '#cc-lm-tcgShowImgContainer img')

    # Read an attribute of the tag.
    print(tag.get_attribute('src'))
    # Sample output:
    # https://images-cn.ssl-images-amazon.com
    # /images/G/28/kindle/design/2018/Device/ys_180925_ATF1500x300_travel_dvc._CB484178544_.jpg

    # Element id, location, tag name and size. These statements had been
    # collapsed into a single comment line, so they never ran.
    print(tag.id)  # sample output: 0.5932406187872517-2
    print(tag.location)  # sample output: {'x': 0, 'y': 0}
    print(tag.tag_name)  # sample output: img
    print(tag.size)  # sample output: {'height': 0, 'width': 0}
    browser.close()

     2.3 节点交互

      通过控制节点,实现通过我们的程序实现浏览器的交互

    from selenium import webdriver
    import time
    
    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')  # open Taobao
    # The element with id "q" is the search input box. It is named
    # `search_box` so the builtin input() is not shadowed.
    search_box = browser.find_element_by_id('q')
    search_box.send_keys('MAC')  # type "MAC"
    time.sleep(1)  # pause for one second
    search_box.clear()  # clear the typed text
    search_box.send_keys('IPhone')  # type "IPhone"
    # The element with class "btn-search" is the search button.
    button = browser.find_element_by_class_name('btn-search')
    button.click()  # run the search
    browser.close()

    2.4 动作链 ————>滑动验证码破解

      将某个节点(文本)从一处拖拽到另一处

      右边的窗口是一个iframe框架,它拥有独立的html标签。因此使用selenium时,需要使用switch_to.frame切换到frame才行,否则找不到

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    import time
    
    browser = webdriver.Chrome()
    url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
    browser.get(url)  # open the page
    # The demo lives inside the iframe with id "iframeResult"; switch into it
    # first, otherwise the elements below cannot be found.
    browser.switch_to.frame('iframeResult')
    source = browser.find_element_by_css_selector('#draggable')  # element to drag
    target = browser.find_element_by_css_selector('#droppable')  # drop destination
    # One-step alternative: ActionChains(browser).drag_and_drop(source, target).perform()

    # A fresh chain is built for every step: a reused chain keeps its queued
    # actions, so each perform() would replay the earlier steps as well.
    ActionChains(browser).click_and_hold(source).perform()  # press and hold the left button
    time.sleep(0.5)

    # Simulate a constant-speed drag: five moves of 17 px along the x axis.
    for i in range(5):
        ActionChains(browser).move_by_offset(xoffset=17, yoffset=0).perform()
        time.sleep(0.5)

    # The release must be performed too; merely queueing it never lets go of the button.
    ActionChains(browser).release().perform()
    browser.close()

    样式:

    3. Javascript执行

      使用execute_script()方法执行下拉进度条任务以及alert任务

    import time
    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.jd.com/')
    time.sleep(5)   # pause first so the scroll is clearly visible
    # First scroll the window straight to the bottom of the page, then pop up
    # an alert showing 123.
    scroll_to_bottom = 'window.scrollTo(0, document.body.scrollHeight)'
    show_alert = 'alert("123")'
    for script in (scroll_to_bottom, show_alert):
        browser.execute_script(script)

     4.前进和后退

      使用back()方法后退,使用forward()方法前进

    import time
    from selenium import webdriver
    
    browser = webdriver.Chrome()
    # Visit three pages in turn to build up some browser history.
    for page in ('https://www.baidu.com', 'https://www.taobao.com', 'http://www.qq.com/'):
        browser.get(page)

    browser.back()  # one step backward in the browser history
    time.sleep(1)
    browser.forward()  # one step forward in the browser history
    browser.close()  # close the current window

    5.操作cookies

      使用selenium,我们还可以获取,删除,修改cookies

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.zhihu.com/explore')
    # Cookies set by the site itself.
    print(browser.get_cookies())
    # Add one cookie, then print the full set again.
    new_cookie = {'name': 'name', 'domain': 'www.zhihu.com', 'value': 'germey'}
    browser.add_cookie(new_cookie)
    print(browser.get_cookies())
    # Remove every cookie; the last print shows what is left.
    browser.delete_all_cookies()
    print(browser.get_cookies())
    browser.close()

    6.虎嗅网滑动验证码操作

      操作主要是利用了灰度差,程序截取了前后两张图片,从左上角到右下角进行循环,进行灰度比较(RGB差值),从而找见缺口,并测出需要移动的距离。然后利用函数模拟人为移动滑块

    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait  # 等待元素加载的
    from selenium.webdriver.common.action_chains import ActionChains  # 拖拽
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from PIL import Image
    import requests
    import re
    import random
    from io import BytesIO
    import time
    
    
    def merge_image(image_file, location_list):
        """Reassemble the shuffled captcha picture into a 260x116 RGB image.

        The opened source image is also saved to ``code.jpg``. Every entry of
        ``location_list`` is a dict with ``'x'`` and ``'y'`` keys. Entries with
        ``y == -58`` are cropped from rows 58-116 of the source and pasted left
        to right along the top of the result; entries with ``y == 0`` are
        cropped from rows 0-58 and pasted along the bottom. Each crop is 10
        pixels wide, starting at ``abs(x)``.

        :param image_file: file (or file-like object) holding the shuffled image
        :param location_list: list of ``{'x': int, 'y': int}`` offsets
        :return: the reassembled image
        """
        shuffled = Image.open(image_file)
        shuffled.save('code.jpg')
        canvas = Image.new('RGB', (260, 116))

        def cut_strip(entry, top, bottom):
            # A 10 px wide vertical strip starting at the entry's x offset.
            left = abs(entry['x'])
            return shuffled.crop((left, top, left + 10, bottom))

        top_row = [cut_strip(loc, 58, 116) for loc in location_list if loc['y'] == -58]
        bottom_row = [cut_strip(loc, 0, 58) for loc in location_list if loc['y'] == 0]

        # Paste each row of strips side by side at its vertical position.
        for row_y, strips in ((0, top_row), (58, bottom_row)):
            x_offset = 0
            for piece in strips:
                canvas.paste(piece, (x_offset, row_y))
                x_offset += piece.size[0]
        return canvas
    
    
    def get_image(driver, div_path):
        """Download the shuffled captcha image and return it reassembled.

        Every element matched by ``div_path`` carries an inline style with the
        shared background image URL and the background offset of its tile. The
        offsets are collected, the image is downloaded once (as jpg instead of
        webp) and then handed to ``merge_image`` together with the offsets.

        :param driver: selenium webdriver showing the captcha
        :param div_path: XPath selecting the tile ``div`` elements
        :return: the reassembled image produced by ``merge_image``
        :raises ValueError: if no tile is found or a tile's style cannot be parsed
        """
        # The parentheses of url("...") are literal text in the style
        # attribute, so they must be escaped in the pattern.
        style_pattern = re.compile(
            r'background-image: url\("(.*?)"\); background-position: (.*?)px (.*?)px;')

        location_list = []
        image_url = None
        for background_image in driver.find_elements_by_xpath(div_path):
            style = background_image.get_attribute('style') or ''
            match = style_pattern.search(style)
            if match is None:
                raise ValueError('unexpected captcha tile style: %r' % style)
            image_url, x, y = match.groups()
            location_list.append({'x': int(x), 'y': int(y)})

        if image_url is None:
            raise ValueError('no captcha tiles found for xpath: %r' % div_path)

        # Request the jpg variant of the image, e.g.
        # http://static.geetest.com/pictures/gt/579066de6/579066de6.webp
        image_url = image_url.replace('webp', 'jpg')
        image_result = requests.get(image_url).content
        image_file = BytesIO(image_result)  # still the shuffled picture
        image = merge_image(image_file, location_list)

        return image
    
    
    def get_track(distance):
        """Build a mouse track that overshoots the target and then slides back.

        Motion is simulated in 0.2 s steps: a random acceleration of 2-5 is
        applied until 7/8 of the requested distance is covered, and a random
        deceleration of 2-5 afterwards. The goal is the requested distance
        plus an overshoot of 8 (also stored in the module-level name
        ``random_int``).

        :param distance: distance the slider has to travel
        :return: ``(tracks, tracks_back)`` -- the rounded positive per-step
            moves, and the remaining moves followed by the correction back to
            the goal and the fixed offsets -2, -5, -8
        """
        global random_int

        step_time = 0.2  # length of one simulated step, in seconds
        velocity = 0
        forward_steps = []
        backward_steps = []
        travelled = 0
        slow_down_at = distance * 7 / 8
        print("distance", distance)
        random_int = 8
        distance += random_int  # overshoot a little, then slide back at the end

        while travelled < distance:
            # Speed up before the slow-down point, slow down after it.
            accel = random.randint(2, 5)
            if not travelled < slow_down_at:
                accel = -accel
            # Displacement covered during this step.
            shift = velocity * step_time + 0.5 * accel * (step_time ** 2)
            travelled += shift
            step = round(shift)
            if step > 0:
                forward_steps.append(step)
            else:
                backward_steps.append(step)

            # Speed at the end of this step is the start speed of the next one.
            velocity = velocity + accel * step_time

            print("tracks:", forward_steps)
            print("tracks_back:", backward_steps)
            print("current:", travelled)

        # Slide back to roughly the exact position.
        backward_steps.append(distance - travelled)
        backward_steps.extend([-2, -5, -8, ])

        return forward_steps, backward_steps
    
    
    def get_distance(image1, image2, threshold=50):
        """Return the x coordinate where the two captcha images start to differ.

        The images are scanned column by column, top to bottom. The first
        pixel whose R, G and B channels all differ by more than ``threshold``
        marks the gap; its x coordinate is the distance to move. The
        comparison is symmetric, so the argument order does not matter.

        :param image1: image object exposing ``size`` and ``getpixel``
        :param image2: the other image, same interface
        :param threshold: minimum per-channel difference that counts as a mismatch
        :return: x coordinate of the first differing pixel, or ``None`` when
            the images never differ by more than the threshold
        """
        # Only the area both images cover can be compared.
        width = min(image1.size[0], image2.size[0])
        height = min(image1.size[1], image2.size[1])

        for x in range(width):
            for y in range(height):
                pixel1 = image1.getpixel((x, y))
                pixel2 = image2.getpixel((x, y))
                if all(abs(a - b) > threshold
                       for a, b in zip(pixel1[:3], pixel2[:3])):
                    return x  # distance to move
        return None
    
    
    def main_check_code(driver, element):
        """Open the login dialog and drag the slider captcha into place.

        :param driver: selenium webdriver already showing the page
        :param element: not used by this function; kept so existing callers
            keep working
        :return: None
        :raises ValueError: if the two captcha images show no gap
        """

        login_btn = driver.find_element_by_class_name('js-login')
        login_btn.click()

        # Click the captcha hint as soon as it is clickable.
        slide_btn = WebDriverWait(driver, 30, 0.5).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'gt_guide_tip')))
        slide_btn.click()

        image1 = get_image(driver, '//div[@class="gt_cut_bg gt_show"]/div')
        image2 = get_image(driver, '//div[@class="gt_cut_fullbg gt_show"]/div')

        # Compare the RGB pixels of both images; the x coordinate of the
        # first mismatch is the distance the slider has to travel.
        l = get_distance(image1, image2)
        print('l=', l)
        if l is None:
            # Without a gap there is no distance to build a track from.
            raise ValueError('no gap found between the captcha images')

        # Build the movement track: forward moves, then the slide back.
        track_list = get_track(l)
        print('第一步,点击滑动按钮')
        knob = WebDriverWait(driver, 30, 0.5).until(
            EC.element_to_be_clickable((By.CLASS_NAME, 'gt_slider_knob')))
        ActionChains(driver).click_and_hold(on_element=knob).perform()  # press and hold the left button
        time.sleep(0.4)
        print('第二步,拖动元素')
        for track in track_list[0]:
            # Move relative to the current mouse position.
            ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()
        for track in track_list[1]:
            ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()
            time.sleep(0.1)
        time.sleep(0.6)
        print('第三步,释放鼠标')
        ActionChains(driver).release(on_element=knob).perform()
        time.sleep(1)
    
    
    def main_check_slider(driver):
        """Load the Huxiu home page until its login button is clickable.

        The page is reloaded after every timeout, with a 5 second pause in
        between; there is no retry limit.

        :param driver: selenium webdriver used to load the page
        :return: the clickable element with class ``js-login``
        """
        login_clickable = EC.element_to_be_clickable((By.CLASS_NAME, 'js-login'))
        while True:
            try:
                driver.get('https://www.huxiu.com/')
                element = WebDriverWait(driver, 30, 0.5).until(login_clickable)
            except TimeoutException:
                print('超时错误,继续')
                time.sleep(5)
                continue
            if element:
                return element
    
    
    if __name__ == '__main__':
        import sys

        count = 3  # at most three recognition attempts
        # Create the driver before the try block: if startup fails there is
        # nothing to clean up and no undefined name in the finally clause.
        driver = webdriver.Chrome()
        try:
            while count > 0:
                # Wait until the login button has loaded, then solve the captcha.
                element = main_check_slider(driver)
                main_check_code(driver, element)
                try:
                    success_element = (By.CSS_SELECTOR, '.gt_success')
                    # The success marker appears once the captcha is accepted.
                    success_images = WebDriverWait(driver, 3).until(
                        EC.presence_of_element_located(success_element))
                except Exception:
                    print('识别错误,继续')
                    count -= 1
                    time.sleep(1)
                else:
                    if success_images:
                        print('成功识别!!!!!!')
                        sys.exit()

            # Only reached when every attempt failed.
            print('too many attempt check code ')
            sys.exit('退出程序')
        finally:
            # quit() closes the window and also stops the chromedriver process.
            driver.quit()

    具体介绍参考祥哥博客

     也可参考CSDN链接

  • 相关阅读:
    Study Plan The Twelfth Day
    Study Plan The Fifteenth Day
    Study Plan The Seventeenth Day
    Study Plan The Tenth Day
    Study Plan The Eighth Day
    Study Plan The Eleventh Day
    Study Plan The Sixteenth Day
    Study Plan The Thirteenth Day
    Study Plan The Fourteenth Day
    Study Plan The Ninth Day
  • 原文地址:https://www.cnblogs.com/LearningOnline/p/9727126.html
Copyright © 2011-2022 走看看