zoukankan      html  css  js  c++  java
  • python_06

    今日内容:
    注意: selenium驱动的浏览器是干净的,没有任何缓存。

    1、selenium剩余用法
    2、selenium万能登录破解
    3、selenium爬取京东商品信息
    4、破解极验滑动验证码

    Xpath语法:

    今日作业:
    1、总结课堂知识点,写博客
    2、爬取京东商品信息
    3、滑动验证(提高题)

    一、自动登录抽屉新热榜

    from selenium import webdriver
    import time

    # Path to the local chromedriver executable (raw string keeps the
    # backslashes literal).
    driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

    # Maximize the browser window.
    driver.maximize_window()

    try:
        driver.get('https://dig.chouti.com/')
        driver.implicitly_wait(10)
        time.sleep(5)

        # 1. Open the login dialog.
        login_btn = driver.find_element_by_id('login_btn')
        login_btn.click()
        time.sleep(2)

        # 2. Enter the user name (phone number).
        # NOTE(review): credentials are hard-coded here; prefer reading them
        # from the environment or a prompt.
        phone = driver.find_element_by_class_name('login-phone')
        phone.send_keys('15622792660')

        # 3. Enter the password.
        pwd = driver.find_element_by_class_name('pwd-password-input')
        pwd.send_keys('kermit46709394')

        # 4. Submit the login form.
        login_submit = driver.find_element_by_class_name('btn-large')
        login_submit.click()

        time.sleep(20)

    # Script-level boundary: report any failure instead of a traceback.
    except Exception as e:
        print(e)

    finally:
        # quit() ends the whole WebDriver session (all windows and the
        # chromedriver process); close() would only close the current window.
        driver.quit()

    二、 selenium选择器之Xpath

    from selenium import webdriver

    # Path to the local chromedriver executable (raw string keeps the
    # backslashes literal).
    driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')


    try:
        # Implicit wait: configured before the get() request.
        driver.implicitly_wait(5)

        driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

        # Explicit wait: would be written after the get() request.
        # wait.until(...)

        # The string below is the markup of the sample page, kept for reference.
        '''
        
    <html>
     <head>
      <base href='http://example.com/' />
      <title>Example website</title>
     </head>
     <body>
      <div id='images'>
       <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
       <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
       <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
       <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
       <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
      </div>
     </body>
    </html>
        '''
        # Locate elements with XPath expressions.
        # "/" selects starting from the root node.
        html = driver.find_element_by_xpath('/html')
        # html = driver.find_element_by_xpath('/head')  # raises: head is not the root
        print(html.tag_name)

        # "//" matches nodes anywhere in the document.
        div = driver.find_element_by_xpath('//div')
        print(div.tag_name)

        # "@" selects by attribute:
        # find the div node whose id is "images".
        div = driver.find_element_by_xpath('//div[@id="images"]')
        print(div.tag_name)
        print(div.text)

        # First <a> node.
        a = driver.find_element_by_xpath('//a')
        print(a.tag_name)

        # All <a> nodes.
        a_s = driver.find_elements_by_xpath('//a')
        print(a_s)

        # href attribute of the first <a> node.
        # get_attribute() reads one attribute of an element.
        href = driver.find_element_by_xpath('//a').get_attribute('href')
        print(href)

    finally:
        # quit() ends the whole WebDriver session (all windows and the
        # chromedriver process); close() would only close the current window.
        driver.quit()

    三、selenium剩余操作

    ''''''
    '''
    点击、清除操作
    '''
    # from selenium import webdriver
    # from selenium.webdriver.common.keys import Keys
    # import time
    #
    # driver = webdriver.Chrome(r'D:BaiduNetdiskDownloadchromedriver_win32chromedriver.exe')
    #
    # try:
    #     driver.implicitly_wait(10)
    #     # 1、往jd发送请求
    #     driver.get('https://www.jd.com/')
    #     # 找到输入框输入围城
    #     input_tag = driver.find_element_by_id('key')
    #     input_tag.send_keys('围城')
    #     # 键盘回车
    #     input_tag.send_keys(Keys.ENTER)
    #     time.sleep(2)
    #     # 找到输入框输入墨菲定律
    #     input_tag = driver.find_element_by_id('key')
    #     input_tag.clear()
    #     input_tag.send_keys('墨菲定律')
    #     # 找到搜索按钮点击搜索
    #     button = driver.find_element_by_class_name('button')
    #     button.click()
    #     time.sleep(10)
    #
    # finally:
    #     driver.close()
    
    
    '''
    获取cookies  (了解)
    '''
    # from selenium import webdriver
    # import time
    #
    # driver = webdriver.Chrome(r'D:BaiduNetdiskDownloadchromedriver_win32chromedriver.exe')
    #
    # try:
    #     driver.implicitly_wait(10)
    #     driver.get('https://www.zhihu.com/explore')
    #     print(driver.get_cookies())
    #
    #     time.sleep(10)
    # finally:
    #     driver.close()
    
    '''
    选项卡
    '''
    #选项卡管理:切换选项卡,有js的方式windows.open,有windows快捷键:
    # ctrl+t等,最通用的就是js的方式
    # import time
    # from selenium import webdriver
    #
    # browser = webdriver.Chrome()
    # try:
    #     browser.get('https://www.baidu.com')
    #
    #     # execute_script: 执行javascript代码
    #     # 弹窗操作
    #     # browser.execute_script('alert("tank")')
    #     # 新建浏览器窗口
    #     browser.execute_script(
    #         '''
    #         window.open();
    #         '''
    #     )
    #     time.sleep(1)
    #     print(browser.window_handles)  # 获取所有的选项卡
    #     # 切换到第二个窗口
    #     # 新:
    #     browser.switch_to.window(browser.window_handles[1])
    #     # 旧:
    #     # browser.switch_to_window(browser.window_handles[1])
    #
    #     # 第二个窗口往淘宝发送请求
    #     browser.get('https://www.taobao.com')
    #     time.sleep(5)
    #
    #     # 切换到第一个窗口
    #     browser.switch_to_window(browser.window_handles[0])
    #     browser.get('https://www.sina.com.cn')
    #
    #     time.sleep(10)
    # finally:
    #     browser.close()
    
    
    '''
    ActionChains动作链
    '''
    # from selenium import webdriver
    # from selenium.webdriver import ActionChains
    # import time
    #
    # driver = webdriver.Chrome()
    # driver.implicitly_wait(10)
    # driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    #
    # try:
    #
    #     # driver.switch_to_frame('iframeResult')
    #     # 切换到id为iframeResult的窗口内
    #     driver.switch_to.frame('iframeResult')
    #
    #     # 源位置
    #     draggable = driver.find_element_by_id('draggable')
    #
    #     # 目标位置
    #     droppable = driver.find_element_by_id('droppable')
    #
    #     # 调用ActionChains,必须把驱动对象传进去
    #     # 得到一个动作链对象,复制给一个变量
    #     actions = ActionChains(driver)
    #
    #     # 方式一: 机器人
    #     # 瞬间把源图片位置秒移到目标图片位置
    #     # actions.drag_and_drop(draggable, droppable)  # 编写一个行为
    #     # actions.perform()  # 执行编写好的行为
    #
    #
    #     # 方式二: 模拟人的行为
    #     source = draggable.location['x']
    #     target = droppable.location['x']
    #     print(source, target)
    #
    #     distance = target - source
    #     print(distance)
    #
    #     # perform:每个动作都要调用perform执行
    #
    #     # 点击并摁住源图片
    #     ActionChains(driver).click_and_hold(draggable).perform()
    #
    #     s = 0
    #     while s < distance:
    #         # 执行位移操作
    #         ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
    #         s += 2
    #
    #     # 释放动作链
    #     ActionChains(driver).release().perform()
    #
    #     time.sleep(10)
    #
    #
    # finally:
    #     driver.close()
    
    
    '''
    前进、后退
    '''
    # from selenium import webdriver
    # import time
    #
    # driver = webdriver.Chrome()
    #
    # try:
    #     driver.implicitly_wait(10)
    #     driver.get('https://www.jd.com/')
    #     driver.get('https://www.baidu.com/')
    #     driver.get('https://www.cnblogs.com/')
    #
    #     time.sleep(2)
    #
    #     # 回退操作
    #     driver.back()
    #     time.sleep(1)
    #     # 前进操作
    #     driver.forward()
    #     time.sleep(1)
    #     driver.back()
    #     time.sleep(10)
    #
    # finally:
    #     driver.close()

    四、破解登录

    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    import time

    # Steps:
    #   1. In the file explorer, enable "show hidden files".
    #   2. Go to C:\Users\administortra\AppData\Local\Google\Chrome\User Data
    #      and delete the "Default" folder.
    #   3. Re-open the browser and log in to the Baidu account
    #      - this creates a fresh "Default" profile folder.
    #   4. Add the cookies (below).
    #   5. Close Chrome, then run this program.

    # Options object that carries the Chrome start-up arguments.
    options = ChromeOptions()

    # Directory in which Chrome keeps the user profile (cookies included):
    # C:\Users\administortra\AppData\Local\Google\Chrome\User Data
    profile_directory = r'--user-data-dir=C:\Users\administortra\AppData\Local\Google\Chrome\User Data'

    # Start Chrome with that user-data directory.
    options.add_argument(profile_directory)

    # Hand the options to the driver; chrome_options is the keyword that
    # receives the options object.
    driver = webdriver.Chrome(chrome_options=options)

    try:
        driver.implicitly_wait(10)
        driver.get('https://www.baidu.com/')

        # Add the user's session cookie (BDUSS:*****).
        # The dict keys "name" and "value" must be lowercase; replace the
        # placeholder value with the real session string.
        driver.add_cookie({"name": "BDUSS", "value": "用户session字符串"})

        # Reload so the page is rendered with the new cookie.
        driver.refresh()

        time.sleep(10)

    finally:
        # quit() ends the whole WebDriver session (all windows and the
        # chromedriver process); close() would only close the current window.
        driver.quit()

    五、04 selenium爬取京东商品信息

    # ''''''
    # '''
    # 爬取京东商品信息:
    #     请求url:
    #         https://www.jd.com/
    #     提取商品信息:
    #         1.商品详情页
    #         2.商品名称
    #         3.商品价格
    #         4.评价人数
    #         5.商品商家
    # '''
    # from selenium import webdriver
    # from selenium.webdriver.common.keys import Keys
    # import time
    #
    # driver = webdriver.Chrome()
    #
    # try:
    #     driver.implicitly_wait(10)
    #     # 1、往京东主页发送请求
    #     driver.get('https://www.jd.com/')
    #
    #     # 2、输入商品名称,并回车搜索
    #     input_tag = driver.find_element_by_id('key')
    #     input_tag.send_keys('macbook')
    #     input_tag.send_keys(Keys.ENTER)
    #     time.sleep(2)
    #
    #     # 通过JS控制滚轮滑动获取所有商品信息
    #     js_code = '''
    #         window.scrollTo(0,5000);
    #     '''
    #     driver.execute_script(js_code)  # 执行js代码
    #
    #     # 等待数据加载
    #     time.sleep(2)
    #
    #     # 3、查找所有商品div
    #     # good_div = driver.find_element_by_id('J_goodsList')
    #     good_list = driver.find_elements_by_class_name('gl-item')
    #     n = 1
    #     for good in good_list:
    #         # 根据属性选择器查找
    #         # 商品链接
    #         good_url = good.find_element_by_css_selector(
    #             '.p-img a').get_attribute('href')
    #
    #         # 商品名称
    #         good_name = good.find_element_by_css_selector(
    #             '.p-name em').text.replace("
    ", "--")
    #
    #         # 商品价格
    #         good_price = good.find_element_by_class_name(
    #             'p-price').text.replace("
    ", ":")
    #
    #         # 评价人数
    #         good_commit = good.find_element_by_class_name(
    #             'p-commit').text.replace("
    ", " ")
    #
    #         # 商品商家
    #         good_from = good.find_element_by_class_name(
    #             'J_im_icon').text.replace("
    ", " ")
    #
    #         good_content = f'''
    #                     商品链接: {good_url}
    #                     商品名称: {good_name}
    #                     商品价格: {good_price}
    #                     评价人数: {good_commit}
    #                     商品商家: {good_from}
    #                     
    
    #                     '''
    #         print(good_content)
    #         with open('jd.txt', 'a', encoding='utf-8') as f:
    #             f.write(good_content)
    #
    #     next_tag = driver.find_element_by_link_text('下一页')
    #
    #     next_tag.click()
    #
    #     time.sleep(10)
    #
    #
    # finally:
    #     driver.close()
    
    
    
    ''''''
    '''
    爬取京东商品信息:
        请求url:
            https://www.jd.com/
        提取商品信息:
            1.商品详情页
            2.商品名称
            3.商品价格
            4.评价人数
            5.商品商家
    '''
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    
    
    def get_good(driver):
        """Scrape every page of the current JD search result into ``jd.txt``.

        For each result page the function scrolls down so more items get
        loaded, reads the link, name, price and review count of every
        ``gl-item`` element, prints the record and appends it to ``jd.txt``,
        then clicks the ``pn-next`` button and repeats. It stops when no
        usable next-page button is found.

        The browser session is always shut down before the function returns
        or raises, so the caller must not use ``driver`` afterwards.

        Args:
            driver: Selenium WebDriver currently showing a JD search result
                page.
        """
        # Scroll via JS so that all product entries get loaded.
        js_code = '''
                window.scrollTo(0,5000);
            '''
        try:
            # A loop (instead of one recursive call per page) keeps the stack
            # flat and guarantees the cleanup below runs exactly once.
            while True:
                driver.execute_script(js_code)

                # Give the page time to load the data.
                time.sleep(2)

                # 3. Collect every product container on the page.
                good_list = driver.find_elements_by_class_name('gl-item')

                with open('jd.txt', 'a', encoding='utf-8') as f:
                    for good in good_list:
                        # Product link
                        good_url = good.find_element_by_css_selector(
                            '.p-img a').get_attribute('href')

                        # Product name (line breaks flattened to "--")
                        good_name = good.find_element_by_css_selector(
                            '.p-name em').text.replace('\n', '--')

                        # Product price
                        good_price = good.find_element_by_class_name(
                            'p-price').text.replace('\n', ':')

                        # Number of reviews
                        good_commit = good.find_element_by_class_name(
                            'p-commit').text.replace('\n', ' ')

                        good_content = f'''
                            商品链接: {good_url}
                            商品名称: {good_name}
                            商品价格: {good_price}
                            评价人数: {good_commit}
                            \n
                            '''
                        print(good_content)
                        f.write(good_content)

                # find_elements returns an empty list (rather than raising)
                # when there is no next-page button.
                next_tags = driver.find_elements_by_class_name('pn-next')
                if not next_tags:
                    break
                next_tag = next_tags[0]

                # NOTE(review): assumes the site marks the last page's button
                # with a "disabled" class - confirm against the live page.
                if 'disabled' in (next_tag.get_attribute('class') or ''):
                    break

                next_tag.click()
                time.sleep(2)

        finally:
            # quit() ends the whole WebDriver session (all windows and the
            # chromedriver process); close() would only close the window.
            driver.quit()
    
    
    if __name__ == '__main__':

        # Ask which product to search for.
        good_name = input('请输入爬取商品信息:').strip()

        driver = webdriver.Chrome()
        try:
            driver.implicitly_wait(10)
            # 1. Open the JD home page.
            driver.get('https://www.jd.com/')

            # 2. Type the product name into the search box and press Enter.
            input_tag = driver.find_element_by_id('key')
            input_tag.send_keys(good_name)
            input_tag.send_keys(Keys.ENTER)
            time.sleep(2)
        except Exception:
            # get_good() is what normally shuts the browser down; if we never
            # get that far, do it here so the browser is not left running.
            driver.quit()
            raise

        get_good(driver)

    作业:破解极验滑动验证

    ''''''
    '''
    破解极验滑动验证
    博客园登录url:
        https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F
    1、输入用户名与密码,并点击登录
    2、弹出滑动验证,获取有缺口与完整的图片
    3、通过像素点进行比对,获取滑动位移距离
    4、模拟人的行为轨迹
    5、开始滑动
    '''
    from selenium import webdriver  # 用来驱动浏览器的
    from selenium.webdriver import ActionChains  # 破解滑动验证码的时候用的 可以拖动图片
    import time
    from PIL import Image  # pip3 install pillow
    import random
    
    # Chrome start-up options for the captcha demo.
    option = webdriver.ChromeOptions()
    # NOTE(review): presumably hides Chrome's "controlled by automated test
    # software" info bar - confirm for the Chrome version in use.
    option.add_argument('disable-infobars')
    
    # Module-level driver used by main(); the browser starts as soon as this
    # statement runs.
    driver = webdriver.Chrome(chrome_options=option)
    
    
    def get_snap(driver):
        """Screenshot the page and return the captcha canvas cropped out of it.

        The full-page screenshot is saved as ``snap.png``; the crop box is the
        on-page rectangle of the ``geetest_canvas_img`` element.

        Args:
            driver: Selenium WebDriver showing the captcha.

        Returns:
            The cropped image object produced by ``Image.crop``.
        """
        # Full-page screenshot taken by selenium itself.
        driver.save_screenshot('snap.png')

        canvas = driver.find_element_by_class_name('geetest_canvas_img')

        # Rectangle of the canvas: top-left corner plus its size.
        x0 = canvas.location['x']
        y0 = canvas.location['y']
        x1 = x0 + canvas.size['width']
        y1 = y0 + canvas.size['height']
        box = (x0, y0, x1, y1)

        # Cut the captcha picture out of the screenshot.
        return Image.open('snap.png').crop(box)
    
    
    def get_image1(driver):
        """Show the ``geetest_canvas_fullbg`` canvas and return its screenshot.

        Sets the element's ``display`` style to ``block`` through JavaScript
        and then captures the captcha area with ``get_snap``.

        Args:
            driver: Selenium WebDriver showing the captcha.

        Returns:
            The cropped captcha image returned by ``get_snap``.
        """
        time.sleep(0.2)
        time.sleep(1)

        # Make the full background canvas visible.
        driver.execute_script('''
        var x = document.getElementsByClassName('geetest_canvas_fullbg')[0].style.display="block";
        console.log(x)
        ''')

        # Capture the captcha area.
        return get_snap(driver)
    
    
    def get_image2(driver):
        """Hide the ``geetest_canvas_fullbg`` canvas and return a screenshot.

        Sets the element's ``display`` style to ``none`` through JavaScript,
        waits a second and then captures the captcha area with ``get_snap``.

        Args:
            driver: Selenium WebDriver showing the captcha.

        Returns:
            The cropped captcha image returned by ``get_snap``.
        """
        hide_full_background = '''
        var x = document.getElementsByClassName('geetest_canvas_fullbg')[0].style.display="none";
        console.log(x)
        '''

        time.sleep(0.2)
        driver.execute_script(hide_full_background)

        # Pause before the screenshot is taken.
        time.sleep(1)

        # Capture the captcha area.
        return get_snap(driver)
    
    
    def get_distance(image1, image2, start=60, color_num=60, offset=7):
        """Find the first column where the two captcha images clearly differ.

        Columns are scanned left to right beginning at ``start``; within a
        column pixels are scanned top to bottom. The scan stops at the first
        pixel whose red, green or blue channel differs by ``color_num`` or
        more between the two images.

        Args:
            image1: First image; must provide ``size`` (width, height) and
                ``load()`` returning an object indexable by ``[x, y]``.
            image2: Second image with the same interface; it is indexed with
                the coordinates of ``image1``.
            start: First column to examine (columns left of it are skipped).
            color_num: Per-channel difference treated as "different".
            offset: Number of pixels subtracted from the matching column.

        Returns:
            ``x - offset`` for the first differing column ``x``, or ``None``
            when no pixel differs enough.
        """
        # load() is loop-invariant: fetch the pixel accessors once instead of
        # once per pixel.
        pixels1 = image1.load()
        pixels2 = image2.load()
        width, height = image1.size

        for x in range(start, width):
            for y in range(height):
                rgb1 = pixels1[x, y]
                rgb2 = pixels2[x, y]

                # Only the first three channels (R, G, B) are compared.
                if any(abs(c1 - c2) >= color_num
                       for c1, c2 in zip(rgb1[:3], rgb2[:3])):
                    return x - offset

        # No column differs enough: make the "not found" result explicit.
        return None
    
    
    def get_stacks(distance):
        """Build a drag trajectory that covers ``distance`` pixels in total.

        The forward track deliberately overshoots the target by 20 px and the
        fixed backward track (which sums to -20) brings the slider back, so
        both tracks together add up to exactly ``distance``.

        The forward steps follow uniformly accelerated motion sampled every
        ``t`` seconds::

            v = v0 + a * t
            s = v * t + 0.5 * a * (t ** 2)

        The acceleration is positive for the first 3/5 of the way and
        negative afterwards; its magnitude is picked at random for each step.

        Args:
            distance: Number of pixels the slider has to travel (a number;
                ``None`` is not accepted).

        Returns:
            A dict with ``'forward_stacks'`` (list of per-step x offsets
            summing to ``distance + 20``; empty when that is not positive)
            and ``'back_stacks'`` (fixed list of negative offsets).
        """
        # Overshoot target; the backward track below compensates for it.
        distance += 20

        # Current velocity (starts at rest).
        v0 = 0

        # Possible acceleration / deceleration magnitudes.
        a_list = [3, 4, 5]

        # Duration of one step.
        t = 0.2

        # Distance covered so far.
        s = 0

        # Forward track.
        forward_stacks = []

        # Point at which acceleration turns into deceleration.
        mid = distance * 3 / 5

        while s < distance:
            a = random.choice(a_list)
            if s >= mid:
                a = -a

            # Displacement of this step, rounded to whole pixels.
            stack = round(v0 * t + 0.5 * a * (t ** 2))

            # Never move past the target: without this clamp the last step
            # could overshoot by up to one step width.
            stack = min(stack, distance - s)

            s += stack
            v0 += a * t

            forward_stacks.append(stack)

        back_stacks = [-1, -1, -2, -3, -2, -3, -2, -2, -3, -1]

        return {'forward_stacks': forward_stacks, 'back_stacks': back_stacks}
    
    
    def main():
        """Log in to cnblogs and drag the slider captcha into place.

        Steps: fill in the login form, open the slider captcha, capture the
        picture with and without the full background shown, derive the drag
        distance from their pixel difference, then replay a generated
        trajectory on the slider button. Uses the module-level ``driver`` and
        always shuts it down at the end.

        Raises:
            RuntimeError: If the two captcha pictures show no difference, so
                no drag distance can be derived.
        """
        try:

            driver.get('https://passport.cnblogs.com/user/signin')
            driver.implicitly_wait(5)

            # 1. Enter user name and password, then click "log in".
            # NOTE(review): credentials are hard-coded here; prefer reading
            # them from the environment or a prompt.
            username = driver.find_element_by_id('LoginName')
            password = driver.find_element_by_id('Password')
            login_button = driver.find_element_by_class_name('ladda-label')
            time.sleep(1)
            username.send_keys('_tank_')
            time.sleep(1)
            password.send_keys('k46709394.')

            # Wait until the credentials are typed in before clicking;
            # otherwise the captcha dialog does not pop up.
            time.sleep(1)
            login_button.click()
            # time.sleep(3)

            # 2. Click the slider button so the captcha pictures are shown.
            geetest_button = driver.find_element_by_class_name('geetest_slider_button')
            geetest_button.click()

            time.sleep(0.2)

            # 3. Capture the complete picture.
            image1 = get_image1(driver)

            # 4. Capture the picture with the gap.
            image2 = get_image2(driver)

            # 5. Compare both pictures to get the drag distance.
            distance = get_distance(image1, image2)
            if distance is None:
                # Fail with a clear message instead of a TypeError further
                # down when the distance is used in arithmetic.
                raise RuntimeError('could not locate the captcha gap')

            # 6. Generate a human-like trajectory.
            stacks = get_stacks(distance)

            # 7. Replay the trajectory on the slider.
            forward_stacks = stacks['forward_stacks']
            back_stacks = stacks['back_stacks']

            slider_button = driver.find_element_by_class_name('geetest_slider_button')
            time.sleep(0.2)

            # Every action has to be executed with perform().
            ActionChains(driver).click_and_hold(slider_button).perform()

            time.sleep(0.2)
            for forward_stack in forward_stacks:
                ActionChains(driver).move_by_offset(xoffset=forward_stack, yoffset=0).perform()
                time.sleep(0.1)
            for back_stack in back_stacks:
                ActionChains(driver).move_by_offset(xoffset=back_stack, yoffset=0).perform()
                time.sleep(0.1)

            time.sleep(0.2)

            # Small wiggle before letting go.
            ActionChains(driver).move_by_offset(xoffset=5, yoffset=0).perform()
            ActionChains(driver).move_by_offset(xoffset=-5, yoffset=0).perform()

            ActionChains(driver).release().perform()

            time.sleep(50)

        finally:
            # quit() ends the whole WebDriver session (all windows and the
            # chromedriver process); close() would only close the window.
            driver.quit()


    if __name__ == '__main__':
        main()
  • 相关阅读:
    python正则表达式re模块
    链表算法题之中等级别,debug调试更简单
    链表算法题二,还原题目,用debug调试搞懂每一道题
    开启算法之路,还原题目,用debug调试搞懂每一道题
    K8S线上集群排查,实测排查Node节点NotReady异常状态
    手写单链表基础之增,删,查!附赠一道链表题
    kafka初识
    docker之mysql镜像使用
    CS61B sp2018笔记 | Lists
    JSONArray.fromObject不执行且不报错问题的解决
  • 原文地址:https://www.cnblogs.com/zhanglei97/p/11049438.html
Copyright © 2011-2022 走看看