zoukankan      html  css  js  c++  java
  • selenium

    driver.find_element_by_*('*')返回首个匹配的元素,在有匹配时等同driver.find_elements_by_*('*')[0];返回的WebElement对象有.send_keys()、.click()等操作方法,以及.text、.get_attribute('*')用于提取该标签自身的文本内容、属性值。
     
    Egの爬租房网agoda——
    class="LazyLoad",连续点翻页键到页底才捕获全。有的网站class="… invisible",也得用browser爬。
     
    import time,random
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.common.action_chains import ActionChains
     
    # Firefox profile that skips content the scraper does not need.
    # NOTE(review): the last two preferences are set to the string 'false' rather
    # than the boolean False -- confirm Firefox applies them as intended.
    profile=webdriver.FirefoxProfile()
    profile.set_preference('permissions.default.image', 2)  # block images
    profile.set_preference('permissions.default.stylesheet', 2)  # block CSS
    profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')  # block Flash
    profile.set_preference('javascript.enabled', 'false')  # block JavaScript

    # Optional manual HTTP/SSL proxy (disabled).
    # ip='118.119.168.172:9999'   # proxy as host:port
    # ip=[int(x) if x.isdigit() else x for x in ip.split(':')]    # the port must be an int
    # profile.set_preference('network.proxy.type', 1)    # 1 = manual proxy configuration
    # profile.set_preference('network.proxy.http',ip[0])
    # profile.set_preference('network.proxy.http_port',ip[1])
    # profile.set_preference('network.proxy.ssl',ip[0])
    # profile.set_preference('network.proxy.ssl_port',ip[1])
    # profile.update_preferences()

    # The geckodriver/chromedriver executable must be on the system PATH;
    # otherwise pass its location through the executable_path argument.
    binary='C:/Program Files/Mozilla Firefox/firefox.exe'   # Firefox is installed in a custom location
    options=webdriver.firefox.options.Options();options.add_argument('-headless')  # headless mode
    driver=webdriver.Firefox(profile,binary,firefox_options=options)
     
    def run(url):
        """Print the visible hotel names from every result page starting at *url*.

        Loads *url* in the module-level ``driver``, pages down repeatedly so
        that list entries further down the page get rendered, prints the
        number of matched elements followed by the text of each visible one,
        then clicks the "next page" button.  The loop ends when that button
        can no longer be found.  The driver is always quit on the way out,
        including when scraping fails.
        """
        # Imported locally so the function does not rely on an extra
        # file-level import.
        from selenium.common.exceptions import NoSuchElementException

        try:
            driver.get(url)
            # A booking pop-up covers the page on first load; a single click
            # dismisses it so the paging key works.
            ActionChains(driver).click().perform()
            while True:
                # Alternative: driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
                # The number of presses needed to reach the bottom is unknown; 20 is a guess.
                for _ in range(20):
                    # send_keys presses and releases the key; key_down would
                    # leave it held and is intended for modifier keys only.
                    ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                    time.sleep(random.random())
                names=driver.find_elements_by_css_selector('ol.hotel-list-container li div ul li h3 span')
                print(len(names))
                # Only visible elements are printed: some sites plant hidden
                # fields as an anti-scraping trap.
                for ele in names:
                    if ele.is_displayed():
                        print(ele.text)
                try:
                    # Every page except the last one has a next-page button.
                    driver.find_element_by_id('paginationNext').click()
                except NoSuchElementException:
                    break
        finally:
            driver.quit()
       
    # Start the scrape on the search-result page for city 16670.  The URL must
    # be a single-line string literal.
    run('https://www.agoda.com/zh-cn/pages/agoda/default/DestinationSearchResult.aspx?city=16670')
    ****************************************分割线****************************************
    局の粮药: 删除下文俩网址和一句中文里的和蟹字
     
    import time,re
    from selenium import webdriver
    # Browser session shared by the functions of this script.
    driver=webdriver.Firefox()
    # Entry page of the site.  The literal contains filler characters that the
    # note preceding this script says must be removed before running.
    indexUrl='http://ap反p1.sf击da.g爬ov.cn/data虫search/fac虫e3/di虫r.html'
     
    def loginByBrowser():
        """Open the index page, enter one category and click through its pages.

        Loads ``indexUrl`` in the module-level ``driver``, follows the link
        whose text contains the category label, reads the total page count
        from the page source and then clicks the image matched by the
        selector below once per remaining page (presumably the next-page
        button -- confirm against the site).

        Raises:
            IndexError: if the page-count text is not found in the page source.
        """
        css = driver.find_element_by_css_selector
        driver.get(indexUrl)
        time.sleep(3)  # fixed pause after loading the index page
        driver.find_element_by_partial_link_text('国①产①药①品(').click()
        # \d+ captures the total page count; without the backslash the
        # pattern would look for literal 'd' characters and never match.
        pages=int(re.findall(r'第1页/共(\d+)页',driver.page_source)[0])-1
        # Page 1 is already displayed, so total-1 clicks are issued.
        for _ in range(pages):
            css('img[src="images/data删an除niu_07.gif"]').click()
     
    # Run the page walk only when this file is executed as a script.
    if __name__ == '__main__':
        loginByBrowser()
    ****************************************分割线****************************************
    登录百度:
     
    # Start page and credentials read by loginByBrowser(); the '……' values are
    # placeholders.
    indexUrl='https://www.baidu.com/'
    userName='……'
    pwd='……'
     
    import time
    from selenium import webdriver
     
    # Chrome options: custom browser binary, no info bar, optional headless
    # mode and image blocking (both disabled here), and one extension.
    options=webdriver.ChromeOptions()
    options.binary_location='D:/Program Files/Browser/CentBrowser/Application/chrome.exe'
    options.add_argument('disable-infobars')
    # A captcha has to be solved by hand, so this example neither runs
    # headless nor blocks images.
    #options.add_argument('headless')
    #options.add_experimental_option('prefs',{'profile.managed_default_content_settings.images':2})
    options.add_extension('D:/广告终结者 3.2.2.crx')
    driver=webdriver.Chrome(chrome_options=options)
     
    def loginByBrowser():
        """Log in through the browser using the module-level ``userName`` and ``pwd``.

        Opens ``indexUrl``, clicks through to the login form, fills in the
        credentials and submits.  It then blocks on ``input()`` until the
        operator has typed the captcha in the browser, and finally tries to
        submit once more.  A WebDriver failure on that second submit is
        ignored on purpose.
        """
        # Imported locally so the function does not rely on an extra
        # file-level import.
        from selenium.common.exceptions import WebDriverException

        driver.get(indexUrl)
        css=driver.find_element_by_css_selector
        css('#u1> a.lb').click()
        time.sleep(2)  # fixed pause before touching the login dialog
        css('.tang-pass-footerBarULogin').click()
        css('[id$=userName]').send_keys(userName)
        css('[id$=password]').send_keys(pwd)
        css('#TANGRAM__PSP_10__submit').click()
        input('浏览器端手动输完验证码后,在本句句尾任敲一字母:')
        try:
            css('#TANGRAM__PSP_10__submit').click()
        except WebDriverException:
            # Best effort only: browser-side failures are ignored, while
            # Ctrl-C and programming errors still propagate.
            pass
     
    # Run the login only when this file is executed as a script.
    if __name__ == '__main__':
        loginByBrowser()
    ****************************************分割线****************************************
    登录58同城:
     
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
     
    # Headless Firefox with a profile that skips images, CSS, Flash and JS.
    # NOTE(review): the last two preferences are set to the string 'false' rather
    # than the boolean False -- confirm Firefox applies them as intended.
    profile = webdriver.FirefoxProfile()
    profile.set_preference('permissions.default.image', 2)  # block images
    profile.set_preference('permissions.default.stylesheet', 2) # block CSS
    profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')    # block Flash
    profile.set_preference('javascript.enabled', 'false')   # block JavaScript
    options=webdriver.firefox.options.Options();
    options.add_argument('-headless')
    driver=webdriver.Firefox(profile,firefox_options=options)
     
    def loginByBrowser(userName,pwd):
        """Log in with a password, wait for manual SMS verification, then log out.

        Args:
            userName: account name typed into the user-name field.
            pwd: password typed into the password field; ENTER submits the form.

        The module-level ``driver`` is always quit before returning, including
        when one of the steps raises, so no browser process is left behind.
        """
        try:
            driver.get('http://passport.58.com/login')
            css=driver.find_element_by_css_selector
            css('#pwdLogin').click()
            css('#usernameUser').send_keys(userName)
            css('#passwordUserText').send_keys(pwd,Keys.ENTER)
            input('浏览器端手动输入短信验证码并点击确定后,在本句句尾任敲一字母:')
            time.sleep(4)  # fixed pause before looking for the logout link
            driver.find_element_by_link_text('退出').click()
        finally:
            driver.quit()
     
    # The two arguments are placeholders for a real user name and password.
    loginByBrowser('用户名','密码')
    ****************************************分割线****************************************
    # 核对省校平台申请毕业或学位的学生:
     
    from selenium import webdriver
    from selenium.webdriver.support.select import Select
    from openpyxl import Workbook
     
    driver=webdriver.Firefox()    # the captcha is an image, so images are not blocked in this example
    css=driver.find_element_by_css_selector  # shorthand used by the functions below
    allPage=[]  # rows collected by studentsInfo() and written out by saveToExcel()
     
    def loginByBrowser(userName,pwd):
        """Open the platform's frameset page and type the credentials into the form.

        Only the two text fields are filled in; nothing is submitted here.

        Args:
            userName: text sent to the field whose name contains ``j_username``.
            pwd: text sent to the field named ``j_password``.
        """
        driver.get('http://222.19.127.21/PRTVUWeb/pages/common/frameset.jsp')
        form_fields=(
            ('[name*=j_username]',userName),
            ('[name=j_password]',pwd),
        )
        for selector,text in form_fields:
            css(selector).send_keys(text)
     
    def graduateApplication():
        """Open the graduation query page and filter for students who applied.

        The query URL is loaded directly; it was found by inspecting network
        traffic, because element-picker tools such as SelectorGadget would not
        start in the first view after login.
        """
        driver.get('http://222.19.127.21/PRTVUWeb/pages/graduate/querystugraduate.jsp')
        # Value '1' of the drop-down means "graduation application exists".
        apply_filter=Select(css('[name=ifAppGraduate]'))
        apply_filter.select_by_value('1')
        submit_button=css('[type=submit]')
        submit_button.click()
        
    def studentsInfo():
        """Collect the student rows of the current page and of all following pages.

        For each page, the page number and row count are printed and a
        selection of columns from every row is appended to the module-level
        ``allPage`` list.  Paging stops when no "next page" button is found,
        which is the case on the last page.

        Pages are walked in a loop rather than by recursion, so the number of
        pages is not limited by the interpreter's recursion depth, and an
        error while reading a later page is raised instead of being silently
        treated as the end of the listing.
        """
        # Imported locally so the function does not rely on an extra
        # file-level import.
        from selenium.common.exceptions import NoSuchElementException

        while True:
            page=css('[name=curPage]').get_attribute('value')
            # Rows are located by a combination of two attributes.
            students=driver.find_elements_by_css_selector('tr[align=center][bgcolor]')
            print(f'输出第{page}页,人数:{len(students)}')
            for stu in students:
                info=stu.text.split()
                # Keep fields 0-1, 3-6 and 8-9.  Lists are concatenated with +
                # because .extend()/.append() return None.
                allPage.append(info[:2]+info[3:7]+info[8:10])
            try:
                css('[value=下一页]').click()
            except NoSuchElementException:
                # The last page has no next-page button.
                return
     
    def saveToExcel():
        """Write a header row followed by every row in ``allPage`` to an .xlsx file.

        The workbook's active sheet is used and the file is saved to a fixed
        path on drive E:.
        """
        header=['序号','学习中心','学号','姓名','专业','层次','申请毕业','申请学位']
        workbook=Workbook()
        sheet=workbook.active
        for row in [header,*allPage]:
            sheet.append(row)
        workbook.save('E:/省校平台查毕业申请.xlsx')
     
    # Script entry point: fill in the login form, wait for the operator to
    # solve the captcha and log in by hand, then query, collect and export.
    if __name__ == '__main__':
        loginByBrowser('用户名','密码')  # placeholder credentials
        input('浏览器端手动输完验证码并点击登录后,在本句句尾任敲一字母:')
        graduateApplication()
        studentsInfo()
        saveToExcel()
    ****************************************分割线****************************************
    某宝的物品搜索:
     
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as ec
    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import TimeoutException
    from bs4 import BeautifulSoup
     
    driver = webdriver.Firefox()    # if the driver executable is not on the system PATH, pass its path here
    wait=WebDriverWait(driver,9)  # explicit waits below give up after 9 seconds
    css=By.CSS_SELECTOR  # locator strategy used in every (css, selector) tuple below
     
    def response():
        """Parse the currently displayed result page and print one dict per item.

        Waits until the item elements are present, stops further page loading,
        then parses the page source with BeautifulSoup and prints the image
        URL, price, sales figure, title and location of each item.
        """
        wait.until(ec.presence_of_all_elements_located((css,'#mainsrp-itemlist .item')))
        # Everything needed is in the DOM by now, so stop loading the rest.
        driver.execute_script('window.stop()')
        soup=BeautifulSoup(driver.page_source,'lxml')
        item_list=soup.find('div','m-itemlist')
        # Original author's note: 36 items are found here, 12 fewer than the
        # site's API URL returns.
        for item in item_list.find_all('div','item'):
            image=item.img
            product={
                'img':image['data-src']+'_360x360Q90.jpg',
                'price':item.find('div','price').text.strip(),
                'sales':item.find('div','deal-cnt').text[:-3],  # drop the 3-character suffix
                'title':image['alt'].split()[0],
                'location':item.find('div','location').text,
            }
            print(product)
    def search(commodity):
        """Search the site's home page for *commodity* and print the first result page.

        Args:
            commodity: search term typed into the query box.
        """
        driver.get('https://www.taobao.com/')
        query_box=wait.until(ec.presence_of_element_located((css,'#q')))
        query_box.send_keys(f'{commodity}')
        search_button=wait.until(ec.element_to_be_clickable((css,'.btn-search')))
        search_button.click()
        response()
     
    def nextPage(page):
        """Jump to result page *page* through the page-number box and print its items.

        Args:
            page: page number to enter; it is also the text awaited in the
                ``span.num`` element before the page is parsed.
        """
        page_box=wait.until(ec.presence_of_element_located((css,'[aria-label=页码输入框]')))
        page_box.clear()
        page_box.send_keys(page)
        confirm_button=wait.until(ec.element_to_be_clickable((css,'span.btn.J_Submit')))
        confirm_button.click()
        # Parse only after the page indicator shows the requested number.
        page_shown=ec.text_to_be_present_in_element((css,'span.num'),str(page))
        wait.until(page_shown)
        print(f'当前是第{page}页')
        response()
     
    # Script entry point: ask for a search term, print the first result page,
    # then pages 2 through 9.
    if __name__ == '__main__':
        commodity=input('请输入要搜索的商品:')
        search(commodity)
        for x in range(2,10):
            nextPage(x)
  • 相关阅读:
    38
    37
    学记
    36.java_exception_test
    c++中enum的用法——枚举类型
    35
    34
    33
    32
    31
  • 原文地址:https://www.cnblogs.com/scrooge/p/7693865.html
Copyright © 2011-2022 走看看