zoukankan      html  css  js  c++  java
  • selenium代码实例

    # 环境安装:pip install selenium
    #  编码流程:
            1.导包:from selenium import webdriver
            2. 实例化某一款浏览器对象
            3.编写指定的自动化操作代码

    # 使用下面的方法,查找指定的元素进行操作
        find_element_by_id            根据id找节点
        find_elements_by_name         根据name找
        find_elements_by_xpath        根据xpath查找
        find_elements_by_tag_name     根据标签名找
        find_elements_by_class_name   根据class名字查找
    # 截屏保存
    browser.save_screenshot(r'phantomjs\baidu.png')
    # 退出驱动程序
    driver.quit()
    # 自动打开百度搜索人民币
    # Open Baidu in Chrome, type the query "人民币" into the search box and
    # print the rendered page source.
    from selenium import webdriver
    from time import sleep

    # NOTE(review): executable_path and find_element_by_id are Selenium 3 APIs;
    # newer Selenium releases replace them with Service(...) and
    # find_element(By.ID, ...) - confirm against the installed version.
    # Location of the local chromedriver binary; adjust to your own machine.
    bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe')
    try:
        bro.get(url='https://www.baidu.com/')
        sleep(2)
        text_input = bro.find_element_by_id('kw')
        # send_keys types text into the <input> element.
        text_input.send_keys('人民币')
        sleep(2)
        # Click the element with id 'su' to submit the query.
        bro.find_element_by_id('su').click()
        sleep(3)
        # Page source of the current page (the data after rendering).
        print(bro.page_source)
    finally:
        # Always shut the browser down, even if a step above raised.
        bro.quit()
    #获取豆瓣电影中更多电影详情数据
    # Load the Douban movie ranking page in headless Chrome, scroll to the
    # bottom three times and save the resulting page source to ./douban.html.
    from selenium import webdriver
    from time import sleep

    # Headless Chrome configuration.
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
    # Location of the local chromedriver binary; adjust to your own machine.
    bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe',chrome_options=chrome_options)
    try:
        bro.get(url)
        sleep(3)
        # Run JS to scroll to the bottom of the page, pausing after each
        # scroll (presumably so newly loaded entries can render).
        for pause in (3, 3, 2):
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(pause)
        page_text = bro.page_source
        with open('./douban.html','w',encoding='utf-8') as fp:
            fp.write(page_text)
        sleep(1)
    finally:
        # Always shut the browser down, even if a step above raised.
        bro.quit()
    #登录qq空间爬取主页
    # Log in to QQ Zone through the password form inside the login iframe and
    # save the page source after login to qq.html.
    from selenium import webdriver
    from time import sleep

    # Location of the local chromedriver binary; adjust to your own machine.
    bro = webdriver.Chrome(executable_path=r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe')
    url = 'https://qzone.qq.com/'
    try:
        bro.get(url=url)
        sleep(2)
        # The login form lives inside an iframe; switch into it before
        # looking up any of its elements.
        bro.switch_to.frame('login_frame')
        # Presumably switches from QR-code login to the account/password form.
        bro.find_element_by_id('switcher_plogin').click()
        sleep(2)

        # NOTE(review): hard-coded account and password; replace with your own
        # and avoid committing real credentials.
        bro.find_element_by_id('u').send_keys('332424')
        bro.find_element_by_id('p').send_keys('dsaafa020@')

        bro.find_element_by_id('login_button').click()

        sleep(5)

        page_text = bro.page_source
        with open('qq.html','w',encoding='utf-8') as fp:
            fp.write(page_text)
    finally:
        # Always shut the browser down, even if a step above raised.
        bro.quit()

    PhantomJS使用(做无头浏览器)(被弃用)

    PhantomJS的作者ariya在PhantomJS的GitHub页面的issue #15344中写道:由于缺乏积极的贡献,我将会存档该项目。如果将来我们又重新开发这个项目的话,这个项目还会被取出来。因此,所有的之前的关于PhantomJS 2.5(由 @Vitallium 提起)和PhantomJS 2.1.x(由 @pixiuPL 提起)的计划也会废弃。接下来,为了防止混淆,上述被废弃的版本的源码和二进制包也会被删除。在未来的通知之前,PhantomJS 2.1.1将会是已知最后的稳定版本。

    #获取豆瓣电影中更多电影详情数据
    # Same Douban scrape as the headless-Chrome version, driven by PhantomJS:
    # load the ranking page, scroll to the bottom three times and save the
    # page source to ./douban.html.
    from selenium import webdriver
    from time import sleep

    url = 'https://movie.douban.com/typerank?type_name=%E6%83%8A%E6%82%9A&type=19&interval_id=100:90&action='
    # NOTE(review): the path separators were reconstructed from a garbled
    # listing; point this at your own phantomjs.exe.
    bro = webdriver.PhantomJS(executable_path=r'C:\Users\Administrator\Desktop\爬虫+数据\day_03_爬虫\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    try:
        bro.get(url)
        sleep(3)
        # Scroll to the bottom of the page, pausing after each scroll.
        for pause in (3, 3, 2):
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(pause)
        page_text = bro.page_source

        with open('./douban.html','w',encoding='utf-8') as fp:
            fp.write(page_text)

        sleep(1)
    finally:
        # Always shut the browser down, even if a step above raised.
        bro.quit()
    

     # 爬取微信公众号文章

    # Scrape a WeChat official account: load its article-history page in
    # headless Chrome, collect the article links, then save every article as
    # an HTML file named after its title.
    from selenium import webdriver
    from lxml import etree
    # sleep is used below; this snippet did not import it on its own.
    from time import sleep

    # Headless Chrome configuration.
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')

    # History-article URL of the official account, captured with Fiddler.
    url = ('https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5NzU0MzU0Nw==&scene=124&uin=MzQxNDc2MTIxOQ%3D%3D&key=5fa6'
           '7e91c99877c92cab8f76d9eba741f20e126dcf62c0a8a42af6c159ae91cc6d9b27dd799b89357259a82e1375e1f275a1960f43e003ac9b5baba11703172d08c'
           '866f9bd6aa20534932779237f7fe8&devicetype=Windows+7&version=62080085&lang=zh_CN&a8scene=7&pass_ticket=bB%2BcRIlVVqJKLAN%2FLxVVoWiJ'
           'XecI7JA3Ttwfs%2FWX0zIjxaW1KxSt6Z2wvmXr8tv0&winzoom=1')
    # Location of the local chromedriver binary; adjust to your own machine.
    driver_path = r'C:\Users\Administrator\Desktop\chromedriver_win32\chromedriver.exe'
    # Characters that are not allowed in Windows file names.
    invalid_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    bro = webdriver.Chrome(executable_path=driver_path,chrome_options=chrome_options)
    try:
        bro.get(url)
        sleep(3)
        # Scroll to the bottom of the page, pausing after each scroll.
        for pause in (3, 3, 2):
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(pause)
        page_text = bro.page_source
        with open('./douban.html','w',encoding='utf-8') as fp:
            fp.write(page_text)
        sleep(1)
    finally:
        # Always shut the browser down, even if a step above raised.
        bro.quit()

    with open('./douban.html','r',encoding="utf-8") as f:
        text_html = f.read()
    etree_page = etree.HTML(text_html)
    # Links of all articles, read from the 'hrefs' attribute of each entry.
    div_list = etree_page.xpath("//div[@class='weui_media_box appmsg js_appmsg']/@hrefs")

    # Download every article of the official account.
    for article_url in div_list:
        bro = None
        try:
            bro = webdriver.Chrome(executable_path=driver_path,chrome_options=chrome_options)
            bro.get(article_url)
            page_text = bro.page_source
            t = etree.HTML(page_text)
            text = t.xpath("//h2[@id='activity-name']/text()")[0].strip()
            # Replace characters Windows rejects in file names so the title
            # can be used as the file name.
            filename = r"C:\Users\Administrator\Desktop\html\%s.html" % text.translate(invalid_chars)
            with open(filename,'w',encoding='utf-8') as fp:
                fp.write(page_text)
            print(page_text)
        except Exception as e:
            # Best effort: report the failure and move on to the next article.
            print(e)
        finally:
            # Close this article's browser whether or not the download worked.
            if bro is not None:
                bro.quit()

      

      

  • 相关阅读:
    中科院大牛博士是如何进行文献检索和阅读(好习惯受益终生)(转载)
    大家平常都喜欢看哪些人的博客,共享…… ZZ (水木)
    heart ultrasound from american society of echocardiography
    用TWaver实现组合的Chart
    Hello TWaver Android
    关注细节-TWaver Android
    TWaver Java内存占用测试
    TWaver Android 概述
    机器学习中的相似性度量
    最大熵模型(一)
  • 原文地址:https://www.cnblogs.com/lujiacheng-Python/p/12489441.html
Copyright © 2011-2022 走看看