利用pip安装selenium,命令为:pip install selenium
我们用selenium写个小例子,功能是打开百度主页,在搜索框中输入网络爬虫,进行搜索。代码如下
#coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# Start Firefox through geckodriver. executable_path must point at the
# geckodriver binary unless that binary is already on PATH.
driver = webdriver.Firefox(executable_path='D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe')
try:
    driver.get("http://www.baidu.com")
    # Sanity check that the Baidu home page actually loaded.
    assert u"百度" in driver.title
    # 'wd' is the name attribute used here to locate the search box.
    elem = driver.find_element_by_name('wd')
    elem.clear()
    elem.send_keys(u"网络爬虫")
    elem.send_keys(Keys.RETURN)
    # Give the result page a moment to render before inspecting it.
    time.sleep(3)
    # A successful search echoes the query in the result page, so the term
    # must be present (the original asserted the opposite and always failed
    # on success).
    assert u"网络爬虫" in driver.page_source
finally:
    # Always end the session, even when an assertion fails, so the browser
    # and the geckodriver process are not left running.
    driver.quit()
如果出现以下错误:
selenium.common.exceptions.WebDriverException: Message: 'geckodriver' executable
needs to be in PATH.
则需要下载geckodriver,并在创建webdriver.Firefox时通过executable_path参数指定其存放路径,例如D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe
通过selenium元素选取(下面各行依次为:定位一个元素的方法、定位多个元素的方法、含义)
find_element_by_id find_elements_by_id 通过元素id进行定位
find_element_by_name find_elements_by_name 通过元素名称进行定位
find_element_by_xpath find_elements_by_xpath 通过xpath表达式进行定位
find_element_by_link_text find_elements_by_link_text 通过完整超链接文本进行定位
find_element_by_partial_link_text find_elements_by_partial_link_text 通过部分超链接文本进行定位
find_element_by_tag_name find_elements_by_tag_name 通过标记名称进行定位
find_element_by_class_name find_elements_by_class_name 通过类名进行定位
find_element_by_css_selector find_elements_by_css_selector 通过css选择器进行定位
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
</head>
<body>
<h1> Welcome </h1>
<p class="content">用户登录</p>
<form id="loginForm">
<select name="loginways">
<option value="email">邮箱</option>
<option value="mobile">手机号</option>
<option value="name">用户名</option>
</select>
<br/>
<input name ="username" type="text"/>
<br/>
密码
<br/>
<input name="password" type="password"/>
<br/><br/>
<input name ="continue" type="submit" value="Login"/>
<input name ="continue" type="button" value="Clear"/>
</form>
<a href ="register.html">Register</a>
</body>
</html>
#coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select #引入该包主要是用来操作select元素
import time
import os

# Start Firefox through geckodriver (the original passed an undefined name
# `c`); executable_path must point at the geckodriver binary.
driver = webdriver.Firefox(executable_path='D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe')
# driver.get() expects a full URL, so turn the local login.html (looked up in
# the current working directory) into a file:// URL.
login_page = 'file:///' + os.path.abspath('login.html').replace('\\', '/').lstrip('/')
driver.get(login_page)

username = driver.find_element_by_name("username")
# Second <input> that is a direct child of the form with id "loginForm".
password = driver.find_element_by_xpath(".//*[@id='loginForm']/input[2]")
login_button = driver.find_element_by_xpath("//input[@type='submit']")

# Wrap the <select> element so its options can be chosen
# (the original XPath misspelled "form" as "from").
select = Select(driver.find_element_by_xpath('//form/select'))
select.select_by_index(1)  # choose an option by its position
select.select_by_visible_text("邮箱")  # choose an option by its displayed text
# Choose an option by its value attribute; the value must be one of the
# strings declared in the page ("email", "mobile", "name"), not an integer.
select.select_by_value("name")

# Empty both inputs first, then type the credentials.
username.clear()
password.clear()
username.send_keys("paul")
password.send_keys("floki")

# Submit last: clicking the submit button presumably navigates away, after
# which the element references above can no longer be used.
login_button.click()
#元素的拖拽
元素的拖拽即将一个元素拖到另一个元素的位置,类似于拼图。首先要找到源元素和目的元素,然后使用ActionChains类即可实现。代码如下:
from selenium.webdriver import ActionChains
# Look up the element to drag and the element to drop it onto.
source_element = driver.find_element_by_name("source")
target_element = driver.find_element_by_name("target")
# Queue the drag-and-drop gesture on an ActionChains object; nothing happens
# in the browser until perform() is called.
drag_action = ActionChains(driver).drag_and_drop(source_element, target_element)
drag_action.perform()
窗口和页面frame的切换
一个浏览器一般都会开多个窗口,我们可以使用switch_to_window方法切换到指定的窗口
# Move the driver's focus to the window identified by "windowName"; later
# commands are then sent to that window.
driver.switch_to_window("windowName")
也可以通过window handle来获取每个窗口的操作对象.实例如下
# Visit every window the driver knows about, switching focus to each one in
# turn by its handle.
for handle in driver.window_handles:
    driver.switch_to_window(handle)
如果切换页面frame,可以使用switch_to_frame
# Move the driver's focus into the frame identified by "frameName".
driver.switch_to_frame("frameName")
# Dotted form: presumably addresses a nested frame (sub-frame 0 of
# "frameName", then its frame "child") -- TODO confirm against the Selenium docs.
driver.switch_to_frame("frameName.0.child")
弹窗处理
如果在处理页面的过程中,触发了某个事件,跳出弹框。可以使用switch_to_alert获取弹框对象,从而进行关闭弹框,获取弹框信息等操作
# Get a handle on the pop-up dialog that is currently open.
alert = driver.switch_to_alert()
# Close the dialog by dismissing it.
alert.dismiss()
历史记录
操作页面的前进和后退功能
# Go one step forward in the browser history.
driver.forward()
# Go one step back in the browser history.
driver.back()
爬取去哪儿网
# coding:utf-8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select # 引入该包主要是用来操作select元素
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
from datetime import timedelta
import time,datetime
import codecs
class QunaSpider(object):
    """Crawl hotel listings from the Qunar hotel search with Selenium.

    Written against the legacy Selenium API (``find_element_by_*``) and
    Python 2 (``unicode``), like the rest of this file.
    """

    @staticmethod
    def _non_blank_lines(text):
        """Return the non-blank lines of *text* with spaces and tabs removed."""
        # The original chained two identical replace(" ", "") calls; the
        # second one was presumably meant to strip tabs.
        content = text.replace(" ", "").replace("\t", "").strip()
        return [ln for ln in content.splitlines() if ln.strip()]

    def get_hotel(self, driver, to_city, fromdate, todate):
        """Search hotels in *to_city* and append every result page to a file.

        Args:
            driver: a Selenium WebDriver already showing the search page.
            to_city: destination city typed into the ``toCity`` input.
            fromdate: check-in date string typed into the ``fromDate`` input.
            todate: check-out date string typed into the ``toDate`` input.

        The text of each ``item_hotel_info`` element is appended to
        ``<to_city><fromdate>.html`` (UTF-8) in the working directory.
        Paging stops when the title wait or the next-page lookup fails.
        """
        ele_to_city = driver.find_element_by_name('toCity')
        ele_from_date = driver.find_element_by_id('fromDate')
        ele_to_date = driver.find_element_by_id('toDate')
        ele_search = driver.find_element_by_class_name('search-btn')

        ele_to_city.clear()
        ele_to_city.send_keys(to_city)  # type the city name into the input
        ele_to_city.click()
        ele_from_date.clear()
        ele_from_date.send_keys(fromdate)
        ele_to_date.clear()
        ele_to_date.send_keys(todate)
        ele_search.click()

        out_name = unicode(to_city) + unicode(fromdate) + u'.html'
        page_num = 0
        while True:
            # Wait until the page title mentions the city; give up otherwise.
            try:
                WebDriverWait(driver, 10).until(
                    EC.title_contains(unicode(to_city)))
            except Exception as e:
                print(e)
                break
            time.sleep(5)
            # Scroll to the bottom of the page, then wait again before
            # reading the page source.
            driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            time.sleep(5)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            infos = soup.find_all(class_='item_hotel_info')
            # `with` closes the file after every page, including on the
            # break paths below where the original left it open.
            with codecs.open(out_name, 'a', 'utf-8') as f:
                for info in infos:
                    f.write(str(page_num) + '--' * 50)
                    for line in self._non_blank_lines(info.get_text()):
                        f.write(line)
                        f.write('\n')

            # Move on to the next result page; stop when it cannot be found
            # or clicked.
            try:
                next_page = WebDriverWait(driver, 10).until(
                    EC.visibility_of(
                        driver.find_element_by_css_selector(".item.next")))
                next_page.click()
                page_num += 1
                time.sleep(10)
            except Exception as e:
                print(e)
                break

    def crawl(self, root_url, to_city):
        """Open *root_url* in Firefox and crawl one-night stays in *to_city*.

        The stay runs from today to tomorrow. The browser is always shut
        down when crawling ends, whether it succeeded or raised.
        """
        # Sample the date once so both ends of the stay stay consistent.
        today = datetime.date.today()
        tomorrow = today + datetime.timedelta(days=1)
        driver = webdriver.Firefox(executable_path='D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe')
        try:
            driver.set_page_load_timeout(50)
            driver.get(root_url)
            driver.maximize_window()
            driver.implicitly_wait(10)
            self.get_hotel(driver, to_city,
                           today.strftime('%Y-%m-%d'),
                           tomorrow.strftime('%Y-%m-%d'))
        finally:
            # The original never released the browser process.
            driver.quit()
if __name__ == '__main__':
    # Script entry point: crawl the Qunar hotel search for Shanghai.
    start_url = 'http://hotel.qunar.com/'
    QunaSpider().crawl(start_url, u"上海")