zoukankan      html  css  js  c++  java
  • selenium

    driver.find_element_by_*('*')返回首个匹配的WebElement对象,等同driver.find_elements_by_*('*')[0](匹配唯一时两者指向同一元素);该对象有.send_keys()、.click()等操作方法,以及用.text、.get_attribute('*')提取自家标签的文本内容、属性值。
     
    Egの爬租房网agoda——
    class="LazyLoad",连续点翻页键到页底才捕获全。有的网站class="… invisible",也得用browser爬。
     
    import time,random
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.common.action_chains import ActionChains
     
    # Firefox profile for scraping: switch off resources the scraper does not need.
    profile=webdriver.FirefoxProfile()
    profile.set_preference('permissions.default.image', 2)  # block images
    profile.set_preference('permissions.default.stylesheet', 2)  # block CSS
    # NOTE(review): the next two values are the string 'false', not the boolean False;
    # confirm Firefox actually honours them before relying on Flash/JS being disabled.
    profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')  # intended: disable Flash
    profile.set_preference('javascript.enabled', 'false')  # intended: disable JavaScript
     
    # Optional manual HTTP/SSL proxy, left disabled.
    # ip='118.119.168.172:9999'   # proxy given as host:port
    # ip=[int(x) if x.isdigit() else x for x in ip.split(':')]    # the port has to be an int
    # profile.set_preference('network.proxy.type', 1)    # 1 = manual proxy
    # profile.set_preference('network.proxy.http',ip[0])
    # profile.set_preference('network.proxy.http_port',ip[1])
    # profile.set_preference('network.proxy.ssl',ip[0])
    # profile.set_preference('network.proxy.ssl_port',ip[1])
    # profile.update_preferences()
     
    # If the directory holding geckodriver/chromedriver is not on the system PATH,
    # pass the executable_path argument as well.
    binary='C:/Program Files/Mozilla Firefox/firefox.exe'   # Firefox is installed in a custom location
    options=webdriver.firefox.options.Options();options.add_argument('-headless')  # headless (silent) mode
    driver=webdriver.Firefox(profile,binary,firefox_options=options)
     
    def run(url, page_downs=20):
        """Scrape the visible hotel names from every result page, then quit the browser.

        Args:
            url: Address of the first search-result page.
            page_downs: How many PAGE_DOWN key presses to send per page so the
                lazily loaded entries get rendered. The number needed to reach
                the bottom is not known in advance; 20 is the original guess.

        The module-level ``driver`` is always closed on exit, including when a
        step raises, so a headless browser is never left running.
        """
        # Imported locally so the snippet does not depend on an extra top-level import.
        from selenium.common.exceptions import WebDriverException

        try:
            driver.get(url)
            # On first load a booking popup covers the page; one click dismisses it
            # so the paging keys work.
            ActionChains(driver).click().perform()
            while True:
                # Alternative: driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
                for _ in range(page_downs):
                    # send_keys presses AND releases the key; key_down alone is only
                    # meant for modifier keys and would leave PAGE_DOWN held down.
                    ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
                    time.sleep(random.random())
                names = driver.find_elements_by_css_selector('ol.hotel-list-container li div ul li h3 span')
                print(len(names))
                # Only report visible elements: some sites plant hidden fields
                # (invisible without reading the page source) as an anti-scraping trap.
                for ele in names:
                    if ele.is_displayed():
                        print(ele.text)
                try:
                    # Every page except the last one has a "next page" button.
                    driver.find_element_by_id('paginationNext').click()
                except WebDriverException:
                    break
        finally:
            driver.quit()
       
    # The URL literal must stay on one line: a quoted string cannot span a line break.
    run('https://www.agoda.com/zh-cn/pages/agoda/default/DestinationSearchResult.aspx?city=16670')
    ****************************************分割线****************************************
    局の粮药: 删除下文俩网址和一句中文里的和蟹字
     
    import time,re
    from selenium import webdriver
    driver=webdriver.Firefox()
    # NOTE: per the author's instruction above this snippet, the URL is deliberately
    # padded with filler characters; delete them before running.
    indexUrl='http://ap反p1.sf击da.g爬ov.cn/data虫search/fac虫e3/di虫r.html'
     
    def loginByBrowser():
        """Open the index page, enter the category link and click through every page.

        Uses the module-level ``driver`` and ``indexUrl``. The total page count is
        read from the "page 1 / of N pages" counter in the page source, and the
        "next" image button is clicked N-1 times.

        Raises:
            IndexError: if the page counter cannot be found in the page source.

        NOTE: the link text and the image path below still contain the filler
        characters the author asked readers to delete; they are left untouched.
        """
        css = driver.find_element_by_css_selector
        driver.get(indexUrl)
        time.sleep(3)  # fixed pause for the page to finish loading
        driver.find_element_by_partial_link_text('国①产①药①品(').click()
        # The digit class needs its backslash: a bare 'd+' only matches the letter "d",
        # so the counter was never found.
        pages=int(re.findall(r'第1页/共(\d+)页',driver.page_source)[0])-1
        for _ in range(pages):
            css('img[src="images/data删an除niu_07.gif"]').click()
     
    # Run the page walk only when this snippet is executed as a script.
    if __name__ == '__main__':
        loginByBrowser()
    ****************************************分割线****************************************
    登录百度:
     
    indexUrl='https://www.baidu.com/'
    # Placeholder credentials; fill in real values before running.
    userName='……'
    pwd='……'
     
    import time
    from selenium import webdriver
     
    # Chrome options covered here: custom binary path, no info bar, headless, no images, load an extension.
    options=webdriver.ChromeOptions()
    options.binary_location='D:/Program Files/Browser/CentBrowser/Application/chrome.exe'
    options.add_argument('disable-infobars')
    #options.add_argument('headless')    # a captcha has to be solved by hand, so this example is neither headless nor image-free
    #options.add_experimental_option('prefs',{'profile.managed_default_content_settings.images':2})
    options.add_extension('D:/广告终结者 3.2.2.crx')
    driver=webdriver.Chrome(chrome_options=options)
     
    def loginByBrowser():
        """Log in through the browser, pausing for the user to solve the captcha.

        Uses the module-level ``driver``, ``indexUrl``, ``userName`` and ``pwd``.
        After the credentials are submitted the script blocks on ``input`` until
        the user has typed the captcha in the browser window, then tries the
        submit button once more.
        """
        # Imported locally so the snippet does not depend on an extra top-level import.
        from selenium.common.exceptions import WebDriverException

        driver.get(indexUrl)
        css=driver.find_element_by_css_selector
        css('#u1> a.lb').click()
        time.sleep(2)  # fixed pause before the login dialog is used
        css('.tang-pass-footerBarULogin').click()
        css('[id$=userName]').send_keys(userName)
        css('[id$=password]').send_keys(pwd)
        css('#TANGRAM__PSP_10__submit').click()
        input('浏览器端手动输完验证码后,在本句句尾任敲一字母:')
        try:
            css('#TANGRAM__PSP_10__submit').click()
        except WebDriverException:
            # Best effort: the button may be gone or not clickable any more,
            # in which case there is nothing left to click.
            pass
     
    # Run the login only when this snippet is executed as a script.
    if __name__ == '__main__':
        loginByBrowser()
    ****************************************分割线****************************************
    登录58同城:
     
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    import time
     
    # Headless Firefox with a trimmed-down profile.
    profile = webdriver.FirefoxProfile()
    profile.set_preference('permissions.default.image', 2)  # block images
    profile.set_preference('permissions.default.stylesheet', 2) # block CSS
    # NOTE(review): the next two values are the string 'false', not the boolean False;
    # confirm Firefox actually honours them.
    profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')    # intended: disable Flash
    profile.set_preference('javascript.enabled', 'false')   # intended: disable JavaScript
    options=webdriver.firefox.options.Options();
    options.add_argument('-headless')
    driver=webdriver.Firefox(profile,firefox_options=options)
     
    def loginByBrowser(userName,pwd):
        """Log in with a password, wait for the manual SMS step, log out and quit.

        Args:
            userName: Account name typed into the user-name field.
            pwd: Password typed into the password field; ENTER is sent right after it.

        The module-level ``driver`` is closed in all cases, so a failure halfway
        through does not leave the headless browser process running.
        """
        try:
            driver.get('http://passport.58.com/login')
            css=driver.find_element_by_css_selector
            css('#pwdLogin').click()  # switch to the password login form
            css('#usernameUser').send_keys(userName)
            css('#passwordUserText').send_keys(pwd,Keys.ENTER)
            # Block until the user confirms the SMS code was entered in the browser.
            input('浏览器端手动输入短信验证码并点击确定后,在本句句尾任敲一字母:')
            time.sleep(4)
            driver.find_element_by_link_text('退出').click()
        finally:
            driver.quit()
     
    # Placeholder arguments (the literals read "user name" / "password"); replace with real credentials.
    loginByBrowser('用户名','密码')
    ****************************************分割线****************************************
    # 核对省校平台申请毕业或学位的学生:
     
    from selenium import webdriver
    from selenium.webdriver.support.select import Select
    from openpyxl import Workbook
     
    driver=webdriver.Firefox()    # the captcha is an image, so images are not blocked in this example
    css=driver.find_element_by_css_selector    # shorthand: look up a single element by CSS selector
    allPage=[]    # accumulates one list of fields per student across all result pages
     
    def loginByBrowser(userName,pwd):
        """Open the platform's frameset page and type the credentials into the form.

        Nothing is submitted here; the fields are only filled in, user name
        first and password second.
        """
        driver.get('http://222.19.127.21/PRTVUWeb/pages/common/frameset.jsp')
        fields = (
            ('[name*=j_username]', userName),
            ('[name=j_password]', pwd),
        )
        for selector, text in fields:
            css(selector).send_keys(text)
     
    def graduateApplication():
        """Open the graduation query page, pick option value '1' and submit the form."""
        # Author's note: in the first view after login, locator plug-ins such as
        # SelectorGadget would not start, so the real address (found by capturing
        # traffic) is loaded directly.
        query_url = 'http://222.19.127.21/PRTVUWeb/pages/graduate/querystugraduate.jsp'
        driver.get(query_url)
        # Drop-down "applied for graduation": value '1' is the "exists" choice per the author.
        dropdown = Select(css('[name=ifAppGraduate]'))
        dropdown.select_by_value('1')
        submit_button = css('[type=submit]')
        submit_button.click()
        
    def studentsInfo():
        """Collect the student rows from every result page into ``allPage``.

        Starting from the page currently shown by the module-level ``driver``,
        each table row is split on whitespace and a selection of its fields is
        appended to ``allPage``. The "next page" button is then clicked until it
        can no longer be found or clicked.

        Pages are walked in a loop rather than by recursion, so long result sets
        cannot hit the recursion limit. Only a failed click ends the walk; an
        error while reading a page is no longer silently swallowed.
        """
        # Imported locally so the snippet does not depend on an extra top-level import.
        from selenium.common.exceptions import WebDriverException

        while True:
            page=driver.find_element_by_css_selector('[name=curPage]').get_attribute('value')
            # Rows are matched on two attributes at once.
            students=driver.find_elements_by_css_selector('tr[align=center][bgcolor]')
            print(f'输出第{page}页,人数:{len(students)}')
            for stu in students:
                info=stu.text.split()
                # Keep tokens 0-1, 3-6 and 8-9 (tokens 2 and 7 are dropped).
                # List "+" returns a new list, unlike .extend()/.append() which return None.
                allPage.append(info[:2]+info[3:7]+info[8:10])
            try:
                # The last page has no "next page" button.
                css('[value=下一页]').click()
            except WebDriverException:
                break
     
    def saveToExcel():
        """Write a header row followed by every row in ``allPage`` to an .xlsx file."""
        header=['序号','学习中心','学号','姓名','专业','层次','申请毕业','申请学位']
        book=Workbook()
        sheet=book.active
        # Header first, then the collected rows in their original order.
        for row in [header, *allPage]:
            sheet.append(row)
        book.save('E:/省校平台查毕业申请.xlsx')
     
    # Script entry point: fill in the login form, wait for the manual captcha step,
    # run the query, scrape all pages and save the result.
    if __name__ == '__main__':
        loginByBrowser('用户名','密码')    # placeholder credentials
        # Block until the user has typed the captcha and clicked "log in" in the browser.
        input('浏览器端手动输完验证码并点击登录后,在本句句尾任敲一字母:')
        graduateApplication()
        studentsInfo()
        saveToExcel()
    ****************************************分割线****************************************
    某宝的物品搜索:
     
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as ec
    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import TimeoutException
    from bs4 import BeautifulSoup
     
    driver = webdriver.Firefox()    # if the driver executable's directory is not on the system PATH, pass its path in the ()
    wait=WebDriverWait(driver,9)    # explicit-wait helper with a timeout of 9
    css=By.CSS_SELECTOR    # locator strategy used for every lookup below
     
    def response():
        """Parse the currently shown result page and print one dict per listed item."""
        item_locator=(css,'#mainsrp-itemlist .item')
        wait.until(ec.presence_of_all_elements_located(item_locator))
        # Everything needed is present now, so stop loading the rest of the page.
        driver.execute_script('window.stop()')
        soup=BeautifulSoup(driver.page_source,'lxml')
        listing=soup.find('div','m-itemlist')
        # Author's note: 36 items are found; 12 more that come from an API URL are missing.
        for item in listing.find_all('div','item'):
            img=item.img['data-src']+'_360x360Q90.jpg'
            price=item.find('div','price').text.strip()
            sales=item.find('div','deal-cnt').text[:-3]  # drops the last 3 characters of the text
            title=item.img['alt'].split()[0]  # first whitespace-separated word of the alt text
            location=item.find('div','location').text
            print({'img':img,'price':price,'sales':sales,'title':title,'location':location})
    def search(commodity):
        """Open the home page, search for ``commodity`` and print the first result page."""
        driver.get('https://www.taobao.com/')
        query_box=wait.until(ec.presence_of_element_located((css,'#q')))
        query_box.send_keys(f'{commodity}')
        search_button=wait.until(ec.element_to_be_clickable((css,'.btn-search')))
        search_button.click()
        response()
     
    def nextPage(page):
        """Jump to result page ``page`` via the page-number box and print its items."""
        box_locator=(css,'[aria-label=页码输入框]')
        submit_locator=(css,'span.btn.J_Submit')
        current_locator=(css,'span.num')

        page_box=wait.until(ec.presence_of_element_located(box_locator))
        page_box.clear()
        page_box.send_keys(page)
        submit=wait.until(ec.element_to_be_clickable(submit_locator))
        submit.click()
        # Wait until the page indicator shows the requested number before parsing.
        wait.until(ec.text_to_be_present_in_element(current_locator,str(page)))
        print(f'当前是第{page}页')
        response()
     
    # Script entry point: ask for a search term, print page 1, then pages 2 through 9.
    if __name__ == '__main__':
        commodity=input('请输入要搜索的商品:')
        search(commodity)
        for x in range(2,10):
            nextPage(x)
  • 相关阅读:
    tar命令,vi编辑器
    Linux命令、权限
    Color Transfer between Images code实现
    利用Eclipse使用Java OpenCV(Using OpenCV Java with Eclipse)
    Matrix Factorization SVD 矩阵分解
    ZOJ Problem Set
    Machine Learning
    ZOJ Problem Set
    ZOJ Problem Set
    ZOJ Problem Set
  • 原文地址:https://www.cnblogs.com/scrooge/p/7693865.html
Copyright © 2011-2022 走看看