一、Selenium
1、声明浏览器对象
# Create one WebDriver object per browser.
from selenium import webdriver

browser_chrome = webdriver.Chrome()
browser_firefox = webdriver.Firefox()
browser_edge = webdriver.Edge()
# NOTE(review): PhantomJS support is deprecated in newer Selenium releases --
# confirm the installed version still provides webdriver.PhantomJS.
browser_phantomjs = webdriver.PhantomJS()
# The driver class is `Safari`; lowercase `webdriver.safari` is the
# sub-package (a module), and calling a module raises TypeError.
browser_safari = webdriver.Safari()
2、访问页面
# Open a page and print its HTML source.
from selenium import webdriver

browser = webdriver.Chrome()
try:
    browser.get('https://www.taobao.com')
    print(browser.page_source)
finally:
    # Runs even if loading or printing fails, so the browser is never left
    # behind. quit() ends the whole WebDriver session, whereas close() only
    # closes the current window.
    browser.quit()
3、查找节点
# 定位单个节点 find_element_by_id find_element_by_name find_element_by_xpath find_element_by_link_text #通过精确文本定位 find_element_by_partial_link_text #通过模糊文本定位 find_element_by_tag_name find_element_by_class_name find_element_by_css_selector # 定位多个节点 find_elements_by_name find_elements_by_xpath find_elements_by_link_text find_elements_by_partial_link_text find_elements_by_tag_name find_elements_by_class_name find_elements_by_css_selector # 公共方法 find_element(By.x, 'x') find_elements(By.x, 'x') #By类型 ID = "id" XPATH = "xpath" LINK_TEXT = "link text" PARTIAL_LINK_TEXT = "partial link text" NAME = "name" TAG_NAME = "tag name" CLASS_NAME = "class name" CSS_SELECTOR = "css selector"
# Four ways to locate the search box node on the Taobao home page:
# by id, by name, by CSS selector and by XPath.
from selenium import webdriver

browser = webdriver.Chrome()
try:
    browser.get('https://www.taobao.com')
    input_id = browser.find_element_by_id('q')
    input_name = browser.find_element_by_name('q')
    input_css = browser.find_element_by_css_selector('#q')
    input_xpath = browser.find_element_by_xpath('//*[@id="q"]')
    print(input_id)
    print(input_name)
    print(input_css)
    print(input_xpath)
finally:
    # Always end the session, even when a lookup raises
    # (e.g. the node is not found); otherwise the browser is leaked.
    browser.quit()
# Generic find_element() versus the dedicated find_element_by_id() helper.
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    browser.get('https://www.taobao.com')
    # find_element(By strategy, lookup value); the two calls below are
    # equivalent ways of locating the same node.
    input_first = browser.find_element(By.ID, 'q')
    input_second = browser.find_element_by_id('q')
    print(input_first)
    print(input_second)
finally:
    # Always end the session, even when a lookup raises; otherwise the
    # browser is leaked.
    browser.quit()
# Locate several nodes at once and print each of them.
from selenium import webdriver

browser = webdriver.Chrome()
try:
    browser.get('https://www.taobao.com')
    # Find every <li> tag in the left-hand navigation bar of the Taobao page.
    lis = browser.find_elements_by_css_selector('.service-bd li')
    for li in lis:
        print(li)
finally:
    # Always end the session, even when the lookup raises; otherwise the
    # browser is leaked.
    browser.quit()
# Read attributes, text and geometry of a located node.
from selenium import webdriver

browser = webdriver.Chrome()
try:
    browser.get('https://www.zhihu.com/explore')
    logo = browser.find_element_by_id('zh-top-link-logo')
    print(logo)

    # get_attribute(attribute name) returns a node attribute.
    print(logo.get_attribute('class'))
    print(logo.get_attribute('href'))
    print(logo.get_attribute('id'))
    print(logo.get_attribute('data-za-c'))

    # logo.text is the node's text.
    print(logo.text)
    print(logo.id)

    # logo.location is the node's position.
    print(logo.location)
    print(logo.parent)
    print(logo.rect)

    # logo.tag_name is the node's tag name.
    print(logo.tag_name)
    print(logo.size)
finally:
    # Always end the session, even when the lookup or an attribute read
    # raises; otherwise the browser is leaked.
    browser.quit()
4、节点交互
# Search for a product on Taobao by typing into the search box and
# clicking the search button.
import time

from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')

# Locate the input box. Named `search_box` rather than `input` so the
# builtin input() is not shadowed.
search_box = browser.find_element_by_id('q')

# Type into the input box.
search_box.send_keys('iPhone')
time.sleep(1)

# Clear the input box.
search_box.clear()

# Type the new query into the input box.
search_box.send_keys('iPad')

# Locate the search button.
btn = browser.find_element_by_css_selector('.btn-search')

# Click the button.
btn.click()
5、动作链(鼠标滑动、拖拽,键盘按键等)
# <select> drop-down handling.
# NOTE: `driver`, `index` and `value` are not defined in this snippet;
# presumably they are placeholders supplied by the surrounding script.
from selenium.webdriver.support.ui import Select

select = Select(driver.find_element_by_name('name'))
select.select_by_index(index)  # select by index
select.select_by_visible_text("text")  # select by visible text
select.select_by_value(value)  # select by value

select = Select(driver.find_element_by_id('id'))
select.deselect_all()  # clear all selections
# Drag and drop: move an element by some offset, or onto another element.
# This snippet drops the "source" node onto the "target" node.
# NOTE: `driver` is not defined in this snippet; presumably it is created
# by the surrounding script.
from selenium.webdriver import ActionChains

source_node = driver.find_element_by_name("source")
drop_node = driver.find_element_by_name("target")

# Queue the drag-and-drop on a fresh action chain, then execute it.
chain = ActionChains(driver)
queued = chain.drag_and_drop(source_node, drop_node)
queued.perform()
# Cookies: open a page, add one cookie, then fetch the cookies.
# NOTE: `driver` is not defined in this snippet; presumably it is created
# by the surrounding script.
driver.get("http://www.example.com")

cookie = {
    'name': 'foo',
    'value': 'bar',
}
driver.add_cookie(cookie)

# The returned cookies are not stored or printed here.
driver.get_cookies()
6、等待(页面加载过慢导致找不到节点,需要等待页面加载完成再找节点)。显式等待设置的是最长等待时间,尽量使用显式等待;隐式等待设置的是固定的等待时间
# Explicit wait: wait for one specific condition, with a 10-second timeout
# passed to WebDriverWait.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

driver = webdriver.Firefox()
driver.get("http://somedomain/url_that_delays_loading")
try:
    # Condition: the node with id "myDynamicElement" is present.
    waiter = WebDriverWait(driver, 10)
    element = waiter.until(
        EC.presence_of_element_located((By.ID, "myDynamicElement"))
    )
finally:
    # The browser is shut down whether or not the wait succeeded.
    driver.quit()
# Implicit wait: set a driver-wide wait (10 seconds) before looking up nodes.
from selenium import webdriver

driver = webdriver.Firefox()
driver.implicitly_wait(
    10  # seconds
)
driver.get(
    "http://somedomain/url_that_delays_loading"
)
myDynamicElement = driver.find_element_by_id(
    "myDynamicElement"
)
7、Headless模式(无界面模式)
# Headless mode: run Chrome without a visible window and print the page source.
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')

# Pass the options through the `options=` keyword; the older
# `chrome_options=` keyword is deprecated.
# NOTE(review): confirm the installed Selenium version accepts `options=`.
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get('https://www.taobao.com')
    print(driver.page_source)
finally:
    # A headless browser has no window to close by hand, so always quit it
    # explicitly, even when loading or printing fails.
    driver.quit()