zoukankan      html  css  js  c++  java
  • day_5:动态渲染页面爬取

    一、Selenium

    1、声明浏览器对象

    from selenium import webdriver

    # Create one driver object per supported browser.
    browser_chrome = webdriver.Chrome()
    browser_firefox = webdriver.Firefox()
    browser_edge = webdriver.Edge()
    # NOTE(review): PhantomJS support is deprecated in newer Selenium releases;
    # confirm it exists in the installed version.
    browser_phantomjs = webdriver.PhantomJS()
    # The driver class is ``Safari``; lowercase ``webdriver.safari`` is a
    # package, so calling it fails.
    browser_safari = webdriver.Safari()

    2、访问页面

    from selenium import webdriver

    # Open the Taobao home page in Chrome, print its page source, then close
    # the browser window.
    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')
    html = browser.page_source
    print(html)
    browser.close()

    3、查找节点

    # 定位单个节点
    find_element_by_id
    find_element_by_name
    find_element_by_xpath
    find_element_by_link_text #通过精确文本定位
    find_element_by_partial_link_text #通过模糊文本定位
    find_element_by_tag_name
    find_element_by_class_name
    find_element_by_css_selector
    
    # 定位多个节点
    find_elements_by_name
    find_elements_by_xpath
    find_elements_by_link_text
    find_elements_by_partial_link_text
    find_elements_by_tag_name
    find_elements_by_class_name
    find_elements_by_css_selector
    
    # 公共方法
    find_element(By.x, 'x')
    find_elements(By.x, 'x')
    
    #By类型
    ID = "id"
    XPATH = "xpath"
    LINK_TEXT = "link text"
    PARTIAL_LINK_TEXT = "partial link text"
    NAME = "name"
    TAG_NAME = "tag name"
    CLASS_NAME = "class name"
    CSS_SELECTOR = "css selector"
    # Locate the search box on the Taobao home page in four different ways.
    from selenium import webdriver

    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')
    input_id = browser.find_element_by_id('q')
    input_name = browser.find_element_by_name('q')
    input_css = browser.find_element_by_css_selector('#q')
    input_xpath = browser.find_element_by_xpath('//*[@id="q"]')
    # Print each lookup result in the order it was obtained.
    for found in (input_id, input_name, input_css, input_xpath):
        print(found)
    browser.close()
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')
    # find_element(By strategy, value): the two lookups below are equivalent.
    input_first = browser.find_element(By.ID, 'q')
    input_second = browser.find_element_by_id('q')
    print(input_first)
    print(input_second)
    browser.close()
    from selenium import webdriver

    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')

    # Find every <li> tag in Taobao's left-hand navigation bar and print each.
    lis = browser.find_elements_by_css_selector('.service-bd li')
    for item in lis:
        print(item)
    browser.close()
    from selenium import webdriver

    browser = webdriver.Chrome()

    browser.get('https://www.zhihu.com/explore')
    logo = browser.find_element_by_id('zh-top-link-logo')
    print(logo)

    # get_attribute(name) reads one attribute of the node.
    for attr_name in ('class', 'href', 'id', 'data-za-c'):
        print(logo.get_attribute(attr_name))

    # logo.text is the node's text.
    print(logo.text)
    print(logo.id)

    # logo.location is the node's position.
    print(logo.location)
    print(logo.parent)
    print(logo.rect)

    # logo.tag_name is the node's tag name.
    print(logo.tag_name)
    print(logo.size)
    browser.close()

    4、节点交互

    # Search for a product on Taobao.
    from selenium import webdriver
    import time

    browser = webdriver.Chrome()
    browser.get('https://www.taobao.com')

    # Find the search input. It is named ``search_box`` so that the built-in
    # ``input()`` is not shadowed.
    search_box = browser.find_element_by_id('q')

    # Type a first query, wait one second, then clear the box.
    search_box.send_keys('iPhone')
    time.sleep(1)
    search_box.clear()

    # Type the query that is actually submitted.
    search_box.send_keys('iPad')

    # Find the search button and click it.
    # The browser is intentionally left open afterwards; call browser.quit()
    # when finished with it.
    btn = browser.find_element_by_css_selector('.btn-search')
    btn.click()

    5、动作链(鼠标滑动、拖拽,键盘按键等)

    # <select> drop-down lists.
    # ``driver``, ``index`` and ``value`` are placeholders that this snippet
    # does not define.
    from selenium.webdriver.support.ui import Select
    select = Select(driver.find_element_by_name('name'))
    select.select_by_index(index)  # choose an option by index
    select.select_by_visible_text("text")  # choose an option by its text
    select.select_by_value(value)  # choose an option by its value

    select = Select(driver.find_element_by_id('id'))
    select.deselect_all()  # clear every selection
    # Drag and drop: move an element by some offset, or onto another element.
    # ``driver`` is a placeholder that this snippet does not define.

    from selenium.webdriver import ActionChains

    source = driver.find_element_by_name("source")
    target = driver.find_element_by_name("target")

    # Build the chain and run it in a single expression.
    ActionChains(driver).drag_and_drop(source, target).perform()
    # cookies
    # ``driver`` is a placeholder that this snippet does not define.

    driver.get("http://www.example.com")

    # Add a cookie with name 'foo' and value 'bar'.
    driver.add_cookie({'name': 'foo', 'value': 'bar'})

    # Fetch the cookies; the result is not used here.
    driver.get_cookies()

    6、等待(页面加载过慢导致找不到节点,需要等待页面加载完成再找节点)。显式等待设置的是最长等待时间,尽量使用显式等待;隐式等待设置的是固定的等待时间

    # Explicit wait
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    driver = webdriver.Firefox()
    driver.get("http://somedomain/url_that_delays_loading")
    try:
        # Wait with a 10-second timeout for the element with this ID to be
        # present.
        wait = WebDriverWait(driver, 10)
        locator = (By.ID, "myDynamicElement")
        element = wait.until(EC.presence_of_element_located(locator))
    finally:
        # Shut the driver down whether or not the wait succeeded.
        driver.quit()
    

    # Implicit wait
    from selenium import webdriver

    driver = webdriver.Firefox()
    driver.implicitly_wait(10)  # seconds
    driver.get("http://somedomain/url_that_delays_loading")
    myDynamicElement = driver.find_element_by_id("myDynamicElement")

    7、Headless模式(无界面模式)

    from selenium import webdriver

    # Configure Chrome to run without a visible window.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    # Pass the options through ``options=``; the older ``chrome_options=``
    # keyword is deprecated.
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get('https://www.taobao.com')
        print(driver.page_source)
    finally:
        # A headless browser has no window to close by hand, so always shut
        # it down here.
        driver.quit()
  • 相关阅读:
    MyBatis映射文件中用#和$传递参数的特点
    使用谷歌浏览器进行Web开发技巧
    YYYY-mm-dd HH:MM:SS 备忘录
    java通过UUID生成16位唯一订单号
    idea如何设置类头注释和方法注释
    如何用符号构建人的思维系统?
    临界点思维模型
    复利思维模型-拥抱人生的指数增长
    提升自我认知的有效方式
    如何去培养顶尖的思维模型?
  • 原文地址:https://www.cnblogs.com/jp-mao/p/10041514.html
Copyright © 2011-2022 走看看