zoukankan      html  css  js  c++  java
  • Python selenium

    利用pip安装selenium,命令为:pip install selenium

    我们用selenium写个小例子,功能是打开百度主页,在搜索框中输入网络爬虫,进行搜索。代码如下

    #coding:utf-8
    # Minimal Selenium example: open the Baidu home page, type a query into the
    # search box, submit it and check that the query shows up in the result page.
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time

    driver = webdriver.Firefox(executable_path='D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe')
    try:
        driver.get("http://www.baidu.com")

        # Sanity check that the expected page was loaded.
        assert u"百度" in driver.title

        # 'wd' is the name attribute of the search input.
        elem = driver.find_element_by_name('wd')
        elem.clear()

        elem.send_keys(u"网络爬虫")
        elem.send_keys(Keys.RETURN)

        # Fixed pause to let the result page render before inspecting it.
        time.sleep(3)

        # The searched term must be present in the result page. The original
        # asserted "not in", which fails whenever the search actually works.
        assert u"网络爬虫" in driver.page_source
    finally:
        # Always end the browser session, even when an assertion or lookup fails.
        driver.quit()

    如果出现以下错误:

    selenium.common.exceptions.WebDriverException: Message: 'geckodriver' executable needs to be in PATH.

    则需要下载geckodriver,并通过executable_path参数指定geckodriver的存放路径,本例中为D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe

    通过selenium元素选取

    定位一个元素                                       定位多个元素                                          说明

    find_element_by_id                                 find_elements_by_id                                   通过元素id进行定位

    find_element_by_name                              find_elements_by_name                                通过元素名称进行定位

    find_element_by_xpath                              find_elements_by_xpath                                通过xpath表达式进行定位

    find_element_by_link_text                          find_elements_by_link_text                            通过完整超链接文本进行定位

    find_element_by_partial_link_text              find_elements_by_partial_link_text                 通过部分超链接文本进行定位

    find_element_by_tag_name                       find_elements_by_tag_name                          通过标记名称进行定位

    find_element_by_class_name                    find_elements_by_class_name                      通过类名进行定位

    find_element_by_css_selector                   find_elements_by_css_selector                      通过css选择器进行定位

    <!-- Sample login page used as the target of the Selenium form-handling example. -->
    <html>
    <head>
    <!-- charset belongs inside the content value of the http-equiv declaration -->
    <meta http-equiv="content-type" content="text/html; charset=utf-8">
    </head>
    <body>
    <h1> Welcome </h1>
    <p class="content">用户登录</p>
    <!-- Plain ASCII quotes: with curly quotes the id would not equal loginForm -->
    <form id="loginForm">
    <select name="loginways">
    <option value="email">邮箱</option>
    <option value="mobile">手机号</option>
    <option value="name">用户名</option>
    </select>
    <br/>
    <input name="username" type="text"/>
    <br/>
    密码
    <br/>
    <input name="password" type="password"/>
    <br/><br/>
    <input name="continue" type="submit" value="Login"/>
    <input name="continue" type="button" value="Clear"/>
    </form>
    <a href="register.html">Register</a>
    </body>
    </html>

    #coding:utf-8
    # Demonstrates locating form controls, driving a <select> element, typing
    # into inputs and submitting the form of the login.html sample page.
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import Select # helper class for operating on <select> elements
    import time

    # The original passed an undefined name `c`; use the same geckodriver
    # location as the other examples in this file.
    driver = webdriver.Firefox(executable_path='D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe')

    # NOTE(review): driver.get() expects a full URL; replace this with the
    # file:// or http:// address where login.html is actually served.
    driver.get("login.html")

    username = driver.find_element_by_name("username")
    # Second <input> child of the form, i.e. the password field.
    password = driver.find_element_by_xpath(".//*[@id='loginForm']/input[2]")

    login_button = driver.find_element_by_xpath("//input[@type='submit']")

    # Wrap the <select> element (the tag is <form>, not <from>).
    select = Select(driver.find_element_by_xpath('//form/select'))
    select.select_by_index(1)  # pick an option by its index
    select.select_by_visible_text("邮箱")  # pick an option by its displayed text
    # Pick an option by its value attribute; the page defines the values
    # "email", "mobile" and "name", so the original argument 1 matched nothing.
    select.select_by_value("mobile")

    # Empty the inputs before typing. Doing this after the submit click, as the
    # original did, may act on element references invalidated by a page reload.
    username.clear()
    password.clear()

    username.send_keys("paul")  # type into the input box
    password.send_keys("floki")

    login_button.click()

    #元素的拖拽

    元素的拖拽即将一个元素拖到另一个元素的位置,类似于拼图。首先要找到源元素和目的元素,然后使用ActionChains类可以实现。代码如下

    from selenium.webdriver import ActionChains

    # Look up the element to drag and the element to drop it onto, by name.
    source_elem = driver.find_element_by_name("source")
    drop_target = driver.find_element_by_name("target")

    # Build the drag-and-drop action for this driver and execute it.
    ActionChains(driver).drag_and_drop(source_elem, drop_target).perform()

    窗口和页面frame的切换

    一个浏览器一般都会开多个窗口,我们可以使用switch_to_window方法实现指定窗口的切换
    # driver.switch_to.window() is the supported spelling; driver.switch_to_window()
    # is a deprecated alias for it that newer Selenium releases no longer provide.
    driver.switch_to.window("windowName")
    也可以通过window handle来获取每个窗口的操作对象.实例如下

    # Give focus to every open window in turn via its handle.
    # driver.switch_to.window() replaces the deprecated driver.switch_to_window().
    for handle in driver.window_handles:
        driver.switch_to.window(handle)

    如果切换页面frame,可以使用switch_to_frame
    # driver.switch_to.frame() replaces the deprecated driver.switch_to_frame().
    driver.switch_to.frame("frameName")
    driver.switch_to.frame("frameName.0.child")

    弹窗处理

    如果在处理页面的过程中,触发了某个事件,跳出弹框。可以使用switch_to_alert获取弹框对象,从而进行关闭弹框,获取弹框信息等操作

    # driver.switch_to.alert is a property returning the open dialog; it replaces
    # the deprecated driver.switch_to_alert() method.
    alert = driver.switch_to.alert
    # Close the dialog without accepting it.
    alert.dismiss()

    历史记录

    操作页面的前进和后退功能

    # Go one step forward in the browser history.
    driver.forward()
    # Go one step back in the browser history.
    driver.back()

    爬取去哪儿网

    # coding:utf-8
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support.ui import Select # 引入该包主要是用来操作select元素
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait
    from bs4 import BeautifulSoup
    from datetime import timedelta
    import time,datetime
    import codecs

    class QunaSpider(object):
        """Crawl Qunar hotel search results with a Selenium-driven Firefox."""

        def get_hotel(self,driver,to_city,fromdate,todate):
            """Run a hotel search and append each result page's text to a file.

            Fills in the destination city and both dates, clicks the search
            button, then for every result page: waits for the page title to
            contain the city, scrolls to the bottom, parses the page source and
            appends the text of every ``item_hotel_info`` element to the file
            ``<to_city><fromdate>.html`` (UTF-8, append mode). The loop ends
            when the title wait fails or no next-page control can be clicked.

            Args:
                driver: WebDriver already showing the hotel search page.
                to_city: destination city typed into the search form.
                fromdate: check-in date string typed into the form.
                todate: check-out date string typed into the form.
            """
            ele_toCity = driver.find_element_by_name('toCity')
            ele_fromDate = driver.find_element_by_id('fromDate')
            ele_toDate = driver.find_element_by_id('toDate')
            ele_search = driver.find_element_by_class_name('search-btn')
            ele_toCity.clear()
            ele_toCity.send_keys(to_city)  # type the city name into the input
            ele_toCity.click()
            ele_fromDate.clear()
            ele_fromDate.send_keys(fromdate)
            ele_toDate.clear()
            ele_toDate.send_keys(todate)
            ele_search.click()

            # `unicode` on Python 2, `str` on Python 3: keeps the original
            # conversions without relying on the Python-2-only builtin name.
            text_type = type(u'')
            city_text = text_type(to_city)
            out_name = city_text + text_type(fromdate) + u'.html'

            page_num = 0
            while True:
                try:
                    WebDriverWait(driver,10).until(EC.title_contains(city_text))
                except Exception as e:
                    print(e)
                    break

                # Scroll to the bottom between two fixed pauses before reading
                # the page source.
                time.sleep(5)
                js = "window.scrollTo(0,document.body.scrollHeight);"
                driver.execute_script(js)
                time.sleep(5)

                htm_const = driver.page_source
                soup = BeautifulSoup(htm_const,'html.parser')
                infos = soup.find_all(class_='item_hotel_info')

                # `with` closes the file after every page, including when the
                # loop is left through `break` or an exception.
                with codecs.open(out_name,'a','utf-8') as f:
                    for info in infos:
                        f.write(str(page_num)+'--'*50)
                        # NOTE(review): the original chained a second identical
                        # replace(" ","") which was a no-op; stripping tabs is
                        # presumably what was meant - confirm.
                        content = info.get_text().replace(" ","").replace("\t","").strip()
                        for line in [ln for ln in content.splitlines() if ln.strip()]:
                            f.write(line)
                            f.write(' ')

                try:
                    next_page = WebDriverWait(driver,10).until(
                        EC.visibility_of(driver.find_element_by_css_selector(".item.next"))
                    )
                    next_page.click()
                    page_num += 1
                    # Fixed pause to let the next result page load.
                    time.sleep(10)
                except Exception as e:
                    # No usable next-page control: stop paging.
                    print(e)
                    break

        def crawl(self,root_url,to_city):
            """Open ``root_url`` in Firefox and crawl one night's hotels in ``to_city``.

            The stay runs from today to tomorrow, both formatted as
            ``YYYY-MM-DD``.

            Args:
                root_url: address of the hotel search page to open.
                to_city: destination city passed on to ``get_hotel``.
            """
            # Read the clock once so both dates derive from the same day.
            current_day = datetime.date.today()
            today = current_day.strftime('%Y-%m-%d')
            tomorrow = (current_day + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

            driver = webdriver.Firefox(executable_path='D:/Program Files (x86)/Mozilla Firefox/geckodriver.exe')
            try:
                driver.set_page_load_timeout(50)
                driver.get(root_url)
                driver.maximize_window()
                driver.implicitly_wait(10)
                self.get_hotel(driver,to_city,today,tomorrow)
            finally:
                # End the browser session whether or not the crawl succeeded.
                driver.quit()

    if __name__ == '__main__':
        # Script entry point: crawl the Qunar hotel listing for Shanghai.
        QunaSpider().crawl('http://hotel.qunar.com/',u"上海")

  • 相关阅读:
    五秒原则,做一件事之前数 5 秒,1,2,3,4,5 立马去做。比如睡觉:数五秒,立马放下手机,闭眼。
    Perl 安装 JSON 包
    Perl: hash散列转换为Json报错集, perl.c,v $$Revision: 4.0.1.8 $$Date: 1993/02/05 19:39:30 $
    叫法: 表名 表字段名 定义每个表字段
    失误1: 把i放到循环体内部,i++失效
    沈南鹏@《遇见大咖》: A轮没投,投了8个月以后就证明了张一鸣是对了,在美国都没有张一鸣这种模式
    xshell通过xftp传输Windows文件到Linux:在输入put后,再摁 TAB 键,可显示当前文件夹的文件
    LeetCode84 Largest Rectangle in Histogram
    全排列问题及其引申问题
    LeetCode Weekly Contest 8
  • 原文地址:https://www.cnblogs.com/paulversion/p/8404267.html
Copyright © 2011-2022 走看看