zoukankan      html  css  js  c++  java
  • python学习之Xpath、selenium其他用法、获取cookies登录验证

    1.Xpath

    from selenium import webdriver
    
    # Raw string so the backslashes in the Windows path are taken literally.
    driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

    try:
        # Implicit wait: set before the get() request is issued.
        driver.implicitly_wait(5)

        driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

        # Explicit wait: would be written after the get() request.
        # wait.until(...)

        # Markup of the sample page, for reference:
        #
        # <html>
        #  <head>
        #   <base href='http://example.com/' />
        #   <title>Example website</title>
        #  </head>
        #  <body>
        #   <div id='images'>
        #    <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
        #    <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
        #    <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
        #    <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
        #    <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
        #   </div>
        #  </body>
        # </html>

        # Look elements up with XPath syntax.
        # "/" starts at the root node and returns the first match.
        html = driver.find_element_by_xpath('/html')
        # driver.find_element_by_xpath('/head') would raise: head is not the root.
        print(html.tag_name)

        # "//" matches a node anywhere below the root.
        div = driver.find_element_by_xpath('//div')
        print(div.tag_name)

        # "@" filters on an attribute: the div whose id is "images".
        div = driver.find_element_by_xpath('//div[@id="images"]')
        print(div.tag_name)
        print(div.text)

        # First <a> node.
        a = driver.find_element_by_xpath('//a')
        print(a.tag_name)

        # Every <a> node.
        a_s = driver.find_elements_by_xpath('//a')
        print(a_s)

        # href of the first <a> node; get_attribute reads one attribute of a node.
        href = driver.find_element_by_xpath('//a').get_attribute('href')
        print(href)

    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()

    2.selenium其他用法

    ''''''
    '''
    点击、清除操作
    '''
    # from selenium import webdriver
    # from selenium.webdriver.common.keys import Keys
    # import time
    #
    # driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
    #
    # try:
    #     driver.implicitly_wait(10)
    #     # 1、往jd发送请求
    #     driver.get('https://www.jd.com/')
    #     # 找到输入框输入围城
    #     input_tag = driver.find_element_by_id('key')
    #     input_tag.send_keys('围城')
    #     # 键盘回车
    #     input_tag.send_keys(Keys.ENTER)
    #     time.sleep(2)
    #     # 找到输入框输入墨菲定律
    #     input_tag = driver.find_element_by_id('key')
    #     input_tag.clear()
    #     input_tag.send_keys('墨菲定律')
    #     # 找到搜索按钮点击搜索
    #     button = driver.find_element_by_class_name('button')
    #     button.click()
    #     time.sleep(10)
    #
    # finally:
    #     driver.close()
    
    
    '''
    获取cookies  (了解)
    '''
    # from selenium import webdriver
    # import time
    #
    # driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')
    #
    # try:
    #     driver.implicitly_wait(10)
    #     driver.get('https://www.zhihu.com/explore')
    #     print(driver.get_cookies())
    #
    #     time.sleep(10)
    # finally:
    #     driver.close()
    
    '''
    选项卡
    '''
    # 选项卡管理:切换选项卡,有js的方式window.open,有windows快捷键:
    # ctrl+t等,最通用的就是js的方式
    # import time
    # from selenium import webdriver
    #
    # browser = webdriver.Chrome()
    # try:
    #     browser.get('https://www.baidu.com')
    #
    #     # execute_script: 执行javascript代码
    #     # 弹窗操作
    #     # browser.execute_script('alert("tank")')
    #     # 新建浏览器窗口
    #     browser.execute_script(
    #         '''
    #         window.open();
    #         '''
    #     )
    #     time.sleep(1)
    #     print(browser.window_handles)  # 获取所有的选项卡
    #     # 切换到第二个窗口
    #     # 新:
    #     browser.switch_to.window(browser.window_handles[1])
    #     # 旧:
    #     # browser.switch_to_window(browser.window_handles[1])
    #
    #     # 第二个窗口往淘宝发送请求
    #     browser.get('https://www.taobao.com')
    #     time.sleep(5)
    #
    #     # 切换到第一个窗口
    #     browser.switch_to.window(browser.window_handles[0])
    #     browser.get('https://www.sina.com.cn')
    #
    #     time.sleep(10)
    # finally:
    #     browser.close()
    
    
    '''
    ActionChains动作链
    '''
    # from selenium import webdriver
    # from selenium.webdriver import ActionChains
    # import time
    #
    # driver = webdriver.Chrome()
    # driver.implicitly_wait(10)
    # driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    #
    # try:
    #
    #     # driver.switch_to_frame('iframeResult')
    #     # 切换到id为iframeResult的窗口内
    #     driver.switch_to.frame('iframeResult')
    #
    #     # 源位置
    #     draggable = driver.find_element_by_id('draggable')
    #
    #     # 目标位置
    #     droppable = driver.find_element_by_id('droppable')
    #
    #     # 调用ActionChains,必须把驱动对象传进去
    #     # 得到一个动作链对象,复制给一个变量
    #     actions = ActionChains(driver)
    #
    #     # 方式一: 机器人
    #     # 瞬间把源图片位置秒移到目标图片位置
    #     # actions.drag_and_drop(draggable, droppable)  # 编写一个行为
    #     # actions.perform()  # 执行编写好的行为
    #
    #
    #     # 方式二: 模拟人的行为
    #     source = draggable.location['x']
    #     target = droppable.location['x']
    #     print(source, target)
    #
    #     distance = target - source
    #     print(distance)
    #
    #     # perform:每个动作都要调用perform执行
    #
    #     # 点击并摁住源图片
    #     ActionChains(driver).click_and_hold(draggable).perform()
    #
    #     s = 0
    #     while s < distance:
    #         # 执行位移操作
    #         ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
    #         s += 2
    #
    #     # 释放动作链
    #     ActionChains(driver).release().perform()
    #
    #     time.sleep(10)
    #
    #
    # finally:
    #     driver.close()
    
    
    '''
    前进、后退
    '''
    # from selenium import webdriver
    # import time
    #
    # driver = webdriver.Chrome()
    #
    # try:
    #     driver.implicitly_wait(10)
    #     driver.get('https://www.jd.com/')
    #     driver.get('https://www.baidu.com/')
    #     driver.get('https://www.cnblogs.com/')
    #
    #     time.sleep(2)
    #
    #     # 回退操作
    #     driver.back()
    #     time.sleep(1)
    #     # 前进操作
    #     driver.forward()
    #     time.sleep(1)
    #     driver.back()
    #     time.sleep(10)
    #
    # finally:
    #     driver.close()

    3.破解登录

    from selenium import webdriver
    from selenium.webdriver import ChromeOptions
    import time
    # Steps:
    #   1. In the file explorer, enable showing hidden files.
    #   2. Go to C:\Users\administortra\AppData\Local\Google\Chrome\User Data
    #      and delete the Default folder.
    #   3. Reopen the browser and log in to the Baidu account
    #      - this creates a fresh Default cache folder.
    #   4. Add the cookies.
    #   5. Close Chrome, then run this program.

    # Options object that carries the Chrome launch arguments.
    options = ChromeOptions()

    # Directory where Chrome keeps the profile (and its cookies).
    # Raw string so the backslashes in the Windows path are taken literally.
    profile_directory = r'--user-data-dir=C:\Users\administortra\AppData\Local\Google\Chrome\User Data'

    # Point Chrome at that user-data directory.
    options.add_argument(profile_directory)

    # Hand the options to the driver; `options` is the current keyword,
    # `chrome_options` is its deprecated spelling.
    driver = webdriver.Chrome(options=options)

    try:
        driver.implicitly_wait(10)
        driver.get('https://www.baidu.com/')

        # Cookie to supply:  BDUSS:*****
        # The "name" and "value" keys must be lowercase.
        driver.add_cookie({"name": "BDUSS", "value": "用户session字符串"})

        # Reload so the page is requested again with the cookie attached.
        driver.refresh()

        time.sleep(10)

    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()

    4.破解滑动验证码

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    import time
    
    driver = webdriver.Chrome()

    try:
        # Everything after driver creation sits inside the try, so a failed
        # page load still reaches the cleanup in `finally`.
        driver.implicitly_wait(10)
        driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

        # Switch into the frame whose id is "iframeResult"
        # (older spelling: driver.switch_to_frame('iframeResult')).
        driver.switch_to.frame('iframeResult')

        # Element to drag.
        draggable = driver.find_element_by_id('draggable')

        # Element to drop onto.
        droppable = driver.find_element_by_id('droppable')

        # Approach 1 (robot-like): jump from source to target in one step:
        #     ActionChains(driver).drag_and_drop(draggable, droppable).perform()
        # ActionChains must always be given the driver object.

        # Approach 2: imitate a human by moving in small steps.
        source = draggable.location['x']
        target = droppable.location['x']
        print(source, target)

        distance = target - source
        print(distance)

        # Each action has to be run with perform().

        # Press and hold the source element.
        ActionChains(driver).click_and_hold(draggable).perform()

        s = 0
        while s < distance:
            # Shift 2 px to the right per step.
            ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
            s += 2

        # Let go of the mouse button.
        ActionChains(driver).release().perform()

        time.sleep(10)

    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window.
        driver.quit()

    5.爬取京东商品信息

    '''
    爬取京东商品信息:
        请求url:
            https://www.jd.com/
        提取商品信息:
            1.商品详情页
            2.商品名称
            3.商品价格
            4.评价人数
            5.商品商家
    '''
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
    
    
    def get_good(driver):
        """Scrape the goods on the current search-result page and all following pages.

        For every page: scroll down so the goods get loaded, read the link,
        name, price and comment count of each ``gl-item`` element, print the
        record and append it to ``jd.txt``, then click the ``pn-next`` element
        to move on. The loop stops when no ``pn-next`` element is found or when
        it carries a ``disabled`` class.

        The driver is always shut down before returning, also on error.

        :param driver: Selenium WebDriver already showing a search-result page.
        """
        # Leading whitespace of each record line in jd.txt.
        pad = ' ' * 24

        try:
            with open('jd.txt', 'a', encoding='utf-8') as f:
                # Pages are walked in a loop rather than by recursion, so the
                # call stack does not grow and cleanup runs exactly once.
                while True:
                    # Scroll via JS so the remaining goods are loaded.
                    driver.execute_script('window.scrollTo(0,5000);')

                    # Give the data time to load.
                    time.sleep(2)

                    for good in driver.find_elements_by_class_name('gl-item'):
                        # Link to the goods page.
                        good_url = good.find_element_by_css_selector(
                            '.p-img a').get_attribute('href')

                        # Name of the goods, line breaks flattened.
                        good_name = good.find_element_by_css_selector(
                            '.p-name em').text.replace("\n", "--")

                        # Price of the goods.
                        good_price = good.find_element_by_class_name(
                            'p-price').text.replace("\n", ":")

                        # Number of comments.
                        good_commit = good.find_element_by_class_name(
                            'p-commit').text.replace("\n", " ")

                        good_content = (
                            f'\n'
                            f'{pad}商品链接: {good_url}\n'
                            f'{pad}商品名称: {good_name}\n'
                            f'{pad}商品价格: {good_price}\n'
                            f'{pad}评价人数: {good_commit}\n'
                            f'{pad}\n'
                            f'\n'
                            f'{pad}'
                        )
                        print(good_content)
                        f.write(good_content)

                    # find_elements_* returns an empty list instead of raising
                    # when nothing matches, which marks the end of the results.
                    next_tags = driver.find_elements_by_class_name('pn-next')
                    if not next_tags:
                        break
                    next_tag = next_tags[0]
                    # NOTE(review): the last page appears to keep the button but
                    # add a "disabled" class to it - confirm against the live page.
                    if 'disabled' in (next_tag.get_attribute('class') or ''):
                        break
                    next_tag.click()

                    time.sleep(2)

            time.sleep(10)

        finally:
            # quit() ends the whole WebDriver session; close() would only close
            # the current window.
            driver.quit()
    
    
    if __name__ == '__main__':

        good_name = input('请输入爬取商品信息:').strip()

        # Raw string so the backslashes in the Windows path are taken literally.
        driver = webdriver.Chrome(r'F:\python学习\Scripts\chromedriver')
        try:
            driver.implicitly_wait(10)
            # 1. Open the JD home page.
            driver.get('https://www.jd.com/')

            # 2. Type the goods name into the search box and press Enter.
            input_tag = driver.find_element_by_id('key')
            input_tag.send_keys(good_name)
            input_tag.send_keys(Keys.ENTER)
            time.sleep(2)
        except Exception:
            # get_good() shuts the driver down itself once it takes over; until
            # then a failure here would otherwise leave the browser running.
            driver.quit()
            raise

        get_good(driver)
  • 相关阅读:
    Python装饰器
    Python常用内建模块
    Python文件的操作
    Python集合的操作
    Python字典的操作
    Python列表元组的操作
    os.path
    Python字符串的操作
    线性回归
    随机森林
  • 原文地址:https://www.cnblogs.com/lhhhha/p/11048680.html
Copyright © 2011-2022 走看看