zoukankan      html  css  js  c++  java
  • python爬虫知识点总结(八)Selenium库详解

     官方学习文档:http://selenium-python.readthedocs.io/api.html

    一、什么是Selenium?

    答:自动化测试工具,支持多种浏览器。用来驱动浏览器,发出指令让浏览器做出各种动作,如下拉,跳转等。

      爬虫中主要用来解决JavaScript渲染的问题。

    注:如果用requests,urllib这些库无法正常获取网页内容,可以用Selenium来完成渲染

    二、安装

    pip3 install selenium

    三、基本使用

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait
    
    browser = webdriver.Chrome()
    try:
        browser.get('http://www.baidu.com')
        input = browser.find_element_by_id('kw')
        input.send_keys('Python')
        input.send_keys(Keys.ENTER)
        wait = WebDriverWait(browser,50)
        wait.until(EC.presence_of_element_located((By.ID,'content_left')))
        print(browser.current_url) 
        print(browser.get_cookies())
        print(browser.page_source)
    finally:
        browser.close()
    

      

    声明浏览器对象

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser = webdriver.Firefox()
    browser = webdriver.Edge()
    browser = webdriver.PhantomJS()
    browser = webdriver.Safari()
    

      

    访问页面

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')
    print(browser.page_source)
    browser.close()
    

      

    四、查找元素

    单个元素

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')
    input_first = browser.find_element_by_id('q')
    input_second = browser.find_element_by_css_selector('#q')
    input_third = browser.find_element_by_xpath('//*[@id="q"]')
    print(input_first,input_second,input_third)
    browser.close()
    

      

    其他方法:

    • find_element_by_name
    • find_element_by_xpath
    • find_element_by_link_text
    • find_element_by_partial_link_text
    • find_element_by_tag_name
    • find_element_by_class_name
    • find_element_by_css_selector
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    
    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')
    input_first = browser.find_element(By.ID,'q')
    print(input_first)
    browser.close()
    

      

    多个元素

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    
    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')
    lis = browser.find_elements_by_css_selector('.service-bd li')
    print(lis)
    browser.close()
    

      

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    
    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')
    lis = browser.find_elements(By.CSS_SELECTOR,'.service-bd li')
    print(lis)
    browser.close()
    

      

    其他方法:

    • find_elements_by_name
    • find_elements_by_xpath
    • find_elements_by_link_text
    • find_elements_by_partial_link_text
    • find_elements_by_tag_name
    • find_elements_by_class_name
    • find_elements_by_css_selector

    五、元素交互操作

    对获取的元素调用交互方法

    from selenium import webdriver
    import time
    
    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')
    input = browser.find_element_by_id('q')
    input.send_keys('iPhone')
    time.sleep(1)
    input.clear()
    input.send_keys('iPad')# 发送keys,即搜索内容
    button = browser.find_element_by_class_name('btn-search')
    button.click()
    

      

    更多操作:http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.remote.webelement

    交互动作

    将动作附加到动作链中串行执行

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    
    browser = webdriver.Chrome()
    url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
    browser.get(url)
    browser.switch_to_frame('iframeResult')
    source = browser.find_element_by_css_selector('#draggable')
    target = browser.find_element_by_css_selector('#droppable')
    actions = ActionChains(browser)
    actions.drag_and_drop(source,target)
    actions.perform()
    

      

    更多操作:http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.common.action_chains

    六、执行JavaScript

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.zhihu.com/explore')
    browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    browser.execute_script('alert("To Bottom")')
    

      

    七、获取元素信息

    获取属性

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    
    browser = webdriver.Chrome()
    url = 'http://www.zhihu.com/explore'
    browser.get(url)
    logo = browser.find_element_by_id('zh-top-link-logo')
    print(logo)
    print(logo.get_attribute('class'))
    

      

    获取文本值

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    url = 'https://www.zhihu.com/explore'
    browser.get(url)
    input = browser.find_element_by_class_name('zu-top-add-question')
    print(input.text)
    

      

    获取ID、位置、标签名、大小

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    url = 'https://www.zhihu.com/explore'
    browser.get(url)
    input = browser.find_element_by_class_name('zu-top-add-question')
    print(input.id)
    print(input.location)
    print(input.tag_name)
    print(input.size)
    

      

    八、Frame

    import time
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    
    browser = webdriver.Chrome()
    url = 'http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
    browser.get(url)
    browser.switch_to_frame('iframeResult')
    source = browser.find_element_by_css_selector('#draggable')
    print(source)
    try:
        logo = browser.find_element_by_class_name('logo')
        print(logo.text)
    except NoSuchElementException:
        print('NO LOGO')
    browser.switch_to.parent_frame()
    logo = browser.find_element_by_class_name('logo')
    print(logo)
    print(logo.text)
    

      

    九、等待

    隐式等待

    当使用了隐式等待执行测试的时候,如果WebDriver没有在DOM中找到元素,将继续等待,超过设定时间后则抛出找不到元素的异常。换句话说,当要查找的元素没有立即出现的时候,隐式等待将等待一段时间再查找DOM,默认时间是0

    from selenium import webdriver
    
    url='https://www.zhihu.com/explore'
    browser = webdriver.Chrome()
    browser.implicitly_wait(10)
    browser.get(url)
    input = browser.find_element_by_class_name('zu-top-add-question')
    print(input)
    

      

    显式等待

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    
    url = 'https://www.taobao.com'
    browser = webdriver.Chrome()
    browser.get(url)
    wait = WebDriverWait(browser,10)
    
    input = wait.until(EC.presence_of_element_located((By.ID,'q')))
    button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'.btn-search')))
    print(input)
    print(button)
    

      

    其他等待条件

    * title_is 标题是某内容
    * title_contains 标题包含某内容
    * presence_of_element_located 元素加载出,传入定位元组,如(By.ID,'p')
    * visibility_of_element_located 元素可见,传入定位元组
    * visibility_of 可见,传入元素对象
    * presence_of_all_elements_located 所有元素加载出
    * text_to_be_present_in_element 某个元素文本包含某文字
    * text_to_be_present_in_element_value 某个元素值包含某文字
    * frame_to_be_available_and_switch_to_it frame加载并切换
    * invisibility_of_element_located 元素不可见
    * element_to_be_clickable 元素可点击
    * staleness_of 判断一个元素是否仍在DOM,可判断页面是否已经刷新
    * element_to_be_selected 元素可选择,传元素对象
    * element_located_to_be_selected 元素可选择,传入定位元组
    * element_selection_state_to_be 传入元素对象以及状态,相等返回True,否则返回False
    * element_located_selection_state_to_be 传入定位元组以及状态,相等返回True,否则返回False
    * alert_is_present 是否出现Alert

    详细内容:http://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.support.expected_conditions

    十、前进后退

    import time
    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com/')
    browser.get('https://www.taobao.com/')
    browser.get('https://www.zhihu.com')
    browser.back()
    time.sleep(1)
    browser.forward()
    browser.close()
    

      

    十一、Cookies

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.zhihu.com/explore')
    print(browser.get_cookies())
    print('\n')
    
    browser.add_cookie({'name':'name','domain':'www.zhihu.com','value':'jack'})
    print(browser.get_cookies())
    print('\n')
    browser.delete_all_cookies()
    print(browser.get_cookies())
    

      

    十二、选项卡管理

    import time
    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()') #js方式window.open打开一个新的选项卡
    print(browser.window_handles)
    browser.switch_to_window(browser.window_handles[1])# 切换到第二个选项卡(新打开的)
    browser.get('https://www.taobao.com')
    time.sleep(1)
    browser.switch_to_window(browser.window_handles[0])# 切换回第一个选项卡
    browser.get('https://python.org')
    

      

    十三、异常处理

    from selenium import webdriver
    
    browser = webdriver.Chrome()
    browser.get('https://www.baidu.com')
    browser.find_element_by_id('hello') # 查找一个不存在的id,会抛出异常
    

      

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException,NoSuchElementException
    
    browser = webdriver.Chrome()
    try:
        browser.get('https://www.baidu.com')
        print('ok')
    except TimeoutException:
        print('Time out')
    

      

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException,NoSuchElementException
    
    browser = webdriver.Chrome()
    try:
        browser.get('https://www.baidu.com')
        print('ok')
    except TimeoutException:
        print('Time out')
    try:
        browser.find_element_by_id('hello')
    except NoSuchElementException:
        print('No Element')
    finally:
        browser.close()
    

      

    异常处理详细文档:http://selenium-python.readthedocs.io/api.html#module-selenium.common.exceptions

  • 相关阅读:
    bzoj 1176 cdq分治套树状数组
    Codeforces 669E cdq分治
    Codeforces 1101D 点分治
    Codeforces 1100E 拓扑排序
    Codeforces 1188D Make Equal DP
    Codeforces 1188A 构造
    Codeforces 1188B 式子转化
    Codeforces 1188C DP 鸽巢原理
    Codeforces 1179D 树形DP 斜率优化
    git commit -m "XX"报错 pre -commit hook failed (add --no-verify to bypass)问题
  • 原文地址:https://www.cnblogs.com/cthon/p/9410796.html
Copyright © 2011-2022 走看看