zoukankan      html  css  js  c++  java
  • selenium爬虫使用

    1. 网页的打开

    from selenium import webdriver
    import time


    # Path to the local chromedriver binary (raw string keeps the backslashes literal).
    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")

    # 1. Launch the browser and load the page
    driver.get('https://www.baidu.com/')

    2.网页浏览器的关闭

    from selenium import webdriver
    import time

    # Path to the local chromedriver binary (raw string keeps the backslashes literal).
    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")
    driver.get('https://www.baidu.com/')
    time.sleep(5)  # keep the page visible for a moment before closing
    driver.close()  # close the current window/tab only
    driver.quit()  # shut down the whole browser and end the driver session

    3. 关于内容元素分析

    from selenium import webdriver
    import time

    # Path to the local chromedriver binary (raw string keeps the backslashes literal).
    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")
    driver.get('https://www.baidu.com/')
    # Locate by id
    inputTag = driver.find_element_by_id('kw')
    # Locate by class name
    inputTag = driver.find_element_by_class_name('s_ipt')
    # Locate by XPath
    inputTag = driver.find_element_by_xpath("//input[@id='kw']")
    # Locate by CSS selector
    inputTag = driver.find_element_by_css_selector('.s_ipt')
    # Elements can also be located through the generic find_element + By API
    from selenium.webdriver.common.by import By

    inputTag = driver.find_element(By.ID, 'kw')
    inputTag.send_keys('python')

    # For text extraction, parse the rendered page source with lxml
    from lxml import etree

    html = etree.HTML(driver.page_source)
    # An empty XPath expression is invalid and raises XPathEvalError, so use a
    # real expression; replace it with whatever nodes you need to extract.
    print(html.xpath('//title/text()'))

    4. 操作表单元素

    from selenium import webdriver
    import time

    # Option 1: send_keys() types text into an input
    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")
    driver.get('https://www.baidu.com/')
    inputTag = driver.find_element_by_id('kw')
    inputTag.send_keys('python')  # type "python" into the kw input
    time.sleep(5)
    inputTag.clear()  # clear the typed text

    # Option 2: click an element
    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")
    driver.get('https://www.baidu.com/')
    inputTag = driver.find_element_by_id('kw')
    inputTag.send_keys('python')  # type "python" into the kw input
    time.sleep(5)
    inputTag.click()  # click the element

    # Option 3: operate a drop-down (<select>) element
    from selenium.webdriver.support.ui import Select
    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")
    driver.get('https://www.baidu.com/')
    # NOTE(review): Select() requires the located element to be a <select> tag;
    # confirm that class 'pf' refers to one on the target page.
    SelectBn = Select(driver.find_element_by_class_name('pf'))
    SelectBn.select_by_index(1)
    # deselect_all() raises NotImplementedError on a single-choice <select>,
    # so only call it when the element supports multiple selections.
    if SelectBn.is_multiple:
        SelectBn.deselect_all()

    5.行为链模仿鼠标进行移动,点击,双击操作

    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    import time

    from selenium.webdriver.support.ui import Select
    # Path to the local chromedriver binary (raw string keeps the backslashes literal).
    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")
    driver.get('https://www.baidu.com/')

    inputTag = driver.find_element_by_id('kw')
    submitBn = driver.find_element_by_id('su')

    actions = ActionChains(driver)
    actions.move_to_element(inputTag)  # queue: move the mouse onto the input

    inputTag.send_keys('python')  # type "python" directly (not part of the chain)
    actions.click(submitBn)  # queue: click the search button
    actions.perform()  # run the queued actions in order


    # Other chainable actions:
    # click_and_hold(element)  # press and hold the left mouse button
    # context_click(element)   # right click
    # double_click(element)    # double click

    6. 对cookie进行操作,包括获取当前cookie,删除cookie

    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    import time

    from selenium.webdriver.support.ui import Select
    # Path to the local chromedriver binary (raw string keeps the backslashes literal).
    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")
    driver.get('https://www.baidu.com/')

    for cookie in driver.get_cookies():  # iterate over every cookie of the session
        print(cookie)

    driver.delete_cookie('BD_HOME')  # delete one cookie by name
    print(driver.get_cookie('BD_HOME'))  # look the cookie up again after deletion
    driver.delete_all_cookies()  # delete every cookie

    7. 隐式等待和显式等待

    # Implicit wait
    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    import time

    from selenium.webdriver.support.ui import Select
    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")
    driver.get('https://www.baidu.com/')
    # Implicit wait: element lookups keep retrying for up to 10 seconds
    driver.implicitly_wait(10)
    driver.find_element_by_id('kw').send_keys('python')
    driver.find_element_by_id('su').click()

    # Explicit wait: an exception is raised if the condition is not met within 10 seconds
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver = webdriver.Chrome(executable_path=r"C:\Users\qq302\Desktop\chromedriver.exe")
    driver.get('https://www.baidu.com/')
    try:
        # Block until the element with id 'kw' is present in the DOM
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'kw'))
        )
    finally:
        # Runs whether the wait succeeded or timed out
        print('end')

    8. 进行窗口的切换driver.switch_to.window(driver.window_handles[1])

    from selenium import webdriver


    # Path to the local chromedriver binary (raw string keeps the backslashes literal).
    driver_path = r'C:\Users\qq302\Desktop\chromedriver.exe'
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com/')

    driver.execute_script("window.open('https://www.douban.com/')")  # open douban in a new window
    driver.execute_script('window.scrollTo(0, 1000000)')  # scroll the page the driver is focused on
    print(driver.window_handles)  # print the handles of all open windows
    driver.switch_to.window(driver.window_handles[1])  # move the driver's focus to the second window
    print(driver.current_url)

    9.使用免费的代理ip打开网页

    # 9. Configure a proxy IP
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    # 61.189.242.243 is the proxy IP, 55484 is the port
    options.add_argument('--proxy-server=http://61.189.242.243:55484')

    driver = webdriver.Chrome(executable_path=r'C:\Users\qq302\Desktop\chromedriver.exe', options=options)

    driver.get(r'http://httpbin.org/ip')  # load the page through the proxy

    10. WebElement元素补充操作

    # 10. Additional WebElement operations
    from selenium import webdriver
    from selenium.webdriver.remote.webelement import WebElement


    # Path to the local chromedriver binary (raw string keeps the backslashes literal).
    driver_path = r'C:\Users\qq302\Desktop\chromedriver.exe'
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com/')

    submitBn = driver.find_element_by_id('su')
    print(submitBn.get_attribute('value'))  # read the element's "value" attribute
    driver.save_screenshot('baidu.png')  # save a screenshot of the current window
  • 相关阅读:
    NSURLSession实现文件上传
    JS中如何判断null、undefined与NaN
    jquery
    url操作等
    设计模式
    javaScript类型转换
    jQuery.noop
    JavaScript严谨模式(Strict Mode)提升开发效率和质量
    Data URI
    e.target e.currenttarget
  • 原文地址:https://www.cnblogs.com/my-love-is-python/p/11360240.html
Copyright © 2011-2022 走看看