  • Selenium

    1. Introduction

    Selenium can be thought of as one of the best tools for getting around anti-scraping measures: it is essentially equivalent to a real browser visiting the site, so it can load dynamically rendered data and saves you from handling cookies by hand. Its one big drawback is efficiency, because driving a full browser is slow. Selenium is therefore best reserved for sites that place heavy restrictions on ordinary crawlers.

    2. Basic usage

    #  -*-coding:utf8 -*-
    
    # selenium + chromedriver for fetching dynamically loaded data
    # selenium works like a robot: it can simulate what a person does in the browser, such as clicking,
    # filling in data, deleting cookies, and so on.
    # chromedriver is the driver program that controls the Chrome browser; selenium needs it to drive
    # the browser. Each browser has its own driver:
    # 1. Chrome: https://sites.google.com/a/chromium.org/chromedriver/downloads
    # 2. Firefox: https://github.com/mozilla/geckodriver/releases
    # 3. Edge: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver
    # 4. Safari: https://webkit.org/blog/6900/webdriver-support-in-safari-10/
    
    # Install selenium:
    # pip3 install selenium
    
    # Install chromedriver: after downloading, put it in a pure-English directory that needs no special permissions.
    
    
    from selenium import webdriver
    
    driver_path = r'D:\chromedriver\chromedriver.exe'
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com')
    # print(driver.page_source)
    
    import time
    time.sleep(5)
    
    # driver.close()   # close the current page
    # driver.quit()    # quit the whole browser
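
    The introduction above mentions the efficiency cost of driving a real browser. One common mitigation, shown here only as a hedged sketch (the --headless flag is a standard Chrome option; the driver path is the same placeholder used above), is to run Chrome without a visible window:

    from selenium import webdriver
    
    # A minimal sketch: run Chrome headless to cut the rendering overhead of a visible window.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')       # no browser window
    options.add_argument('--disable-gpu')    # commonly paired with headless mode on Windows
    
    driver_path = r'D:\chromedriver\chromedriver.exe'   # placeholder path, as above
    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    driver.get('https://www.baidu.com')
    print(driver.title)
    driver.quit()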

    3. Finding elements

    #  -*-coding:utf8 -*-
    
    from selenium import webdriver
    
    driver_path=r'D:\chromedriver\chromedriver.exe'
    driver=webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com')
    
    
    from selenium.webdriver.common.by import By
    # Locating elements
    '''
    1. find_element_by_id: find an element by its id
    inputTag=driver.find_element_by_id('kw')
    inputTag=driver.find_element(By.ID,'kw')
    2. find_element_by_class_name: find an element by its class name
    submitTag=driver.find_element_by_class_name('su')
    submitTag=driver.find_element(By.CLASS_NAME,'su')
    3. find_element_by_name: find an element by the value of its name attribute
    submitTag=driver.find_element(By.NAME,'su')
    submitTag=driver.find_element_by_name('su')
    4. find_element_by_tag_name: find an element by its tag name
    submitTag=driver.find_element_by_tag_name('div')
    submitTag=driver.find_element(By.TAG_NAME,'div')
    5. find_element_by_xpath: find an element using XPath syntax
    submitTag=driver.find_element_by_xpath('//div')
    submitTag=driver.find_element(By.XPATH,'//div')
    6. find_element_by_css_selector: find an element using a CSS selector (a CSS selector, not an XPath)
    submitTag=driver.find_element(By.CSS_SELECTOR,'div')
    submitTag=driver.find_element_by_css_selector('div')
    
    Every locator has a find_element_by_ and a find_elements_by_ form: the first returns a single element, the second returns a list.
    '''
    # inputTag=driver.find_element_by_id('kw')
    # inputTag=driver.find_element_by_name('wd')
    # inputTag=driver.find_element_by_class_name('s_ipt')
    # inputTag=driver.find_element_by_xpath('//input[@id="kw"]')
    # inputTag=driver.find_element_by_css_selector('.quickdelete-wrap > input')
    inputTag=driver.find_elements_by_css_selector('.quickdelete-wrap > input')[0]
    inputTag.send_keys('python')
    
    # 1. If you only want to parse data out of the page, it is better to hand the page source to lxml,
    #    because lxml is implemented in C and parses much faster.
    # 2. If you need to interact with elements (type text into an input, click a button, and so on),
    #    you must use the element-finding methods that selenium provides.
    
    from selenium import webdriver
    from lxml import etree
    from selenium.webdriver.common.by import By
    
    # Finding elements with By
    driver_path=r'D:\chromedriver\chromedriver'
    driver=webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com')
    
    inputTag=driver.find_element(By.ID,'kw')
    inputTag.send_keys('python')
    
    # get_attribute('innerHTML') returns the HTML inside an element
    # get_attribute('outerHTML') returns the element's HTML including the element's own tag
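
    As a follow-up to the comment above about handing the page source to lxml: here is a minimal sketch combining selenium (to render the page) with lxml (to parse it). The XPath and the 'form' element id are illustrative assumptions, not taken from the original notes.

    from selenium import webdriver
    from lxml import etree
    
    driver_path = r'D:\chromedriver\chromedriver.exe'   # placeholder path, as above
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com')
    
    # Hand the rendered source to lxml and do the read-only parsing there.
    html = etree.HTML(driver.page_source)
    links = html.xpath('//a/@href')        # e.g. collect every link on the page
    print(len(links))
    
    # The same HTML can also be read from a WebElement directly:
    form = driver.find_element_by_id('form')        # assumed id of Baidu's search form
    print(form.get_attribute('innerHTML')[:200])    # the HTML inside that form
    driver.quit()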

    4. Working with form elements

    #  -*-coding:utf8 -*-
    
    # Common form elements:
    # button
    # checkbox
    # select (drop-down list)
    # input
    
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    import time
    
    driver_path = r'D:\chromedriver\chromedriver'
    driver = webdriver.Chrome(executable_path=driver_path)
    # driver.get('https://www.baidu.com')
    #
    # inputTag = driver.find_element(By.ID, 'kw')
    # # type "python" into the input box
    # inputTag.send_keys('python')
    # time.sleep(5)
    # # clear what was typed
    # inputTag.clear()
    
    # checkbox: tick it by clicking
    # driver.get('https://www.douban.com')
    # rememberBth=driver.find_element_by_name('remember')
    # rememberBth.click()
    
    
    # select drop-downs
    # driver.get('http://www.dobai.cn/')
    from selenium.webdriver.support.ui import Select
    # to drive a <select>, first wrap the element in a Select instance
    # selectBtn=Select(driver.find_element_by_name('jumpMenu'))
    # select by index
    # selectBtn.select_by_index(1)
    # select by value
    # selectBtn.select_by_value('http://m.95xiu.com/')
    # select by visible text
    # selectBtn.select_by_visible_text('95秀客户端')
    # deselect everything (only valid for multi-select elements)
    # selectBtn.deselect_all()
    
    
    
    # click event on a button
    driver.get('https://www.baidu.com')
    
    inputTag=driver.find_element_by_id('kw')
    inputTag.send_keys('python')
    submitTag=driver.find_element_by_id('su')
    time.sleep(5)
    submitTag.click()
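
    The Select calls above are commented out because the demo site may no longer be reachable. As a self-contained, hedged sketch, the same API can be exercised against any page that contains a <select> element; the URL, the element name 'city' and the option text below are purely hypothetical:

    from selenium import webdriver
    from selenium.webdriver.support.ui import Select
    
    driver_path = r'D:\chromedriver\chromedriver.exe'        # placeholder path
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://example.com/form')                   # hypothetical page with <select name="city">
    
    select_tag = Select(driver.find_element_by_name('city'))
    select_tag.select_by_index(0)                            # pick the first option
    select_tag.select_by_visible_text('Beijing')             # or pick by the text the user sees (hypothetical option)
    print(select_tag.first_selected_option.text)             # the option that is currently selected
    driver.quit()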

    5. Action chains

    #  -*-coding:utf8 -*-
    
    # Action chains
    # Sometimes an operation on a page takes many steps; the mouse action-chain class ActionChains
    # can string those steps together. Action chains are not used very often in scraping.
    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    import time
    
    driver_path = r'D:\chromedriver\chromedriver'
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com')
    
    inputTag=driver.find_element_by_id('kw')
    submitBtn=driver.find_element_by_id('su')
    
    actions=ActionChains(driver)
    actions.move_to_element(inputTag)
    actions.send_keys_to_element(inputTag,'python')
    actions.move_to_element(submitBtn)
    actions.click()
    actions.perform()
    
    # There are more mouse-related operations, for example:
    # click_and_hold(element): press the mouse button without releasing it
    # context_click(element): right-click
    # double_click(element): double-click
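
    To make the extra operations listed above concrete, here is a small hedged sketch; the target is the same Baidu button used earlier, chosen only for illustration:

    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    
    driver_path = r'D:\chromedriver\chromedriver.exe'   # placeholder path
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com')
    
    button = driver.find_element_by_id('su')
    actions = ActionChains(driver)
    actions.context_click(button)     # right-click the search button
    actions.perform()
    
    # ActionChains calls can also be chained fluently:
    ActionChains(driver).move_to_element(button).double_click(button).perform()
    driver.quit()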

    6. Working with cookies

    #  -*-coding:utf8 -*-
    
    # Cookie operations
    # 1. get all cookies:
    # driver.get_cookies()
    # 2. get one cookie by its key:
    # value = driver.get_cookie(key)
    # 3. delete all cookies:
    # driver.delete_all_cookies()
    # 4. delete one cookie:
    # driver.delete_cookie(key)
    
    import time
    from selenium import webdriver
    driver_path=r'D:\chromedriver\chromedriver'
    driver=webdriver.Chrome(executable_path=driver_path)
    # get_cookies() only returns the cookies of the page currently loaded (https://www.baidu.com here), not cookies belonging to other pages
    driver.get('https://www.baidu.com')
    
    # for cookie in driver.get_cookies():
    #     print(cookie)
    
    print(driver.get_cookie('PSTM'))
    
    # driver.delete_cookie('PSTM')
    # print(driver.get_cookie('PSTM'))
    
    # delete all cookies
    # driver.delete_all_cookies()
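
    The operations above only read and delete cookies; selenium can also inject one with add_cookie, which is the usual way to replay a login cookie captured elsewhere. A hedged sketch (the cookie name and value below are made up):

    from selenium import webdriver
    
    driver_path = r'D:\chromedriver\chromedriver.exe'   # placeholder path
    driver = webdriver.Chrome(executable_path=driver_path)
    
    # add_cookie only applies to the domain that is currently loaded, so visit the site first.
    driver.get('https://www.baidu.com')
    driver.add_cookie({'name': 'demo_cookie', 'value': 'demo_value'})   # hypothetical cookie
    print(driver.get_cookie('demo_cookie'))
    driver.quit()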

    7. Implicit and explicit waits

    #  -*-coding:utf8 -*-
    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    import time
    
    driver_path = r'D:\chromedriver\chromedriver'
    driver = webdriver.Chrome(executable_path=driver_path)
    
    # Page waits:
    # More and more pages load their data with Ajax, so the program cannot tell when a given element
    # has finished loading. If the page takes too long and a DOM element is not there yet, but the code
    # already tries to use that WebElement, a NoSuchElementException is raised. To deal with this,
    # Selenium provides two kinds of waits: implicit waits and explicit waits.
    
    # 1. Implicit wait: call driver.implicitly_wait(seconds); any lookup of an element that is not yet
    #    available will first wait up to that many seconds before failing.
    driver.get('https://www.douban.com/')
    # without a wait, the lookup fails immediately:
    # driver.find_element_by_id('shdiasjdsdas')
    # with a wait configured:
    # driver.implicitly_wait(20)
    # driver.find_element_by_id('shdiasjdsdas')  # only raises after waiting 20 seconds
    
    
    # 2. Explicit wait: wait until a stated condition holds before fetching the element, with a maximum
    #    time attached; if the condition is still not met when the time runs out, an exception is raised.
    #    Explicit waits are the smarter of the two.
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    # until() takes a condition:
    # WebDriverWait(driver,10).until(
    #     # wait until the element is present; the condition takes a single argument, so pass a tuple
    #     EC.presence_of_element_located((By.ID,'asdasdasdasda'))
    # )
    
    # if the element can be found earlier, the full 10 seconds are not spent waiting:
    element=WebDriverWait(driver,10).until(
        EC.presence_of_element_located((By.ID,'anony-book'))
    )
    print(element)
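
    presence_of_element_located is only one of the conditions shipped in expected_conditions; element_to_be_clickable and title_contains are two others. A hedged sketch reusing the douban.com page and the 'anony-book' locator from above (the title text is an assumption about the page):

    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    
    driver_path = r'D:\chromedriver\chromedriver'        # placeholder path
    driver = webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.douban.com/')
    
    # wait until the element is not only present but visible and enabled
    element = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, 'anony-book'))
    )
    print(element.tag_name)
    
    # conditions do not have to be about elements; this one watches the page title
    WebDriverWait(driver, 10).until(EC.title_contains('豆瓣'))
    driver.quit()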

    8. Opening and switching between windows

    #  -*-coding:utf8 -*-
    
    # Switching pages:
    # A browser window often has several tabs open, and you need to switch between them. selenium exposes
    # driver.switch_to.window for this; the handle of the tab to switch to comes from driver.window_handles.
    
    from selenium import webdriver
    import time
    
    driver_path = r'D:\chromedriver\chromedriver'
    driver = webdriver.Chrome(executable_path=driver_path)
    
    driver.get('https://www.baidu.com')
    # open a second tab with douban in it
    driver.execute_script("window.open('https://www.douban.com')")
    # the current URL still shows Baidu; to work in the douban tab you have to switch to it
    print(driver.current_url)
    
    # switch pages with driver.switch_to.window
    # to switch you first need the window handle; every page the driver opens gets a handle,
    # stored in driver.window_handles
    # print(driver.window_handles)
    driver.switch_to.window(driver.window_handles[1])
    print(driver.current_url)

    9. Using a proxy

    #  -*-coding:utf8 -*-
    
    # Setting a proxy IP:
    # If you crawl a site too frequently, the server may detect the crawler and block your IP address;
    # switching to a proxy IP gets around that.
    from selenium import webdriver
    
    options=webdriver.ChromeOptions()
    options.add_argument('--proxy-server=http://113.124.87.163:9999')
    driver_path = r'D:\chromedriver\chromedriver'
    driver=webdriver.Chrome(executable_path=driver_path,chrome_options=options)
    driver.get('http://httpbin.org/ip')

    10. Extras

    #  -*-coding:utf8 -*-
    
    from selenium import webdriver
    from selenium.webdriver.remote.webdriver import WebElement
    driver_path = r'D:\chromedriver\chromedriver'
    driver=webdriver.Chrome(executable_path=driver_path)
    driver.get('https://www.baidu.com')
    
    submitBtn=driver.find_element_by_id('su')
    # print(type(submitBtn))
    print(submitBtn.get_attribute('value'))
    driver.save_screenshot('baidu.png')  # save a screenshot of the current page

    11. Hands-on: scraping Lagou with requests

    #  -*-coding:utf8 -*-
    import re
    import requests
    import time
    from lxml import etree
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'Cookie':'JSESSIONID=ABAAABAAAGFABEFFA5F21EB50BF5A6DCE619C8EEA6CB14A; SEARCH_ID=1146364cc73d498abea7c5b4dde4c1e3; user_trace_token=20190417144437-71ba273c-c709-43be-ae40-d1c531c2a4d7; X_HTTP_TOKEN=42daf4b72327b2817743845551bf5e71415983ed09'
    }
    
    # Lagou has anti-scraping measures and rotates the cookie on every request; what to do about that is covered later
    def request_list_page():
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
        data = {
            'first': False,
            'pn': 1,
            'kd': 'python'
        }
        for x in range(1, 14):
            data['pn'] = x
            response = requests.post(url, headers=headers, data=data)
            result = response.json()
            positions = result['content']['positionResult']['result']
            for position in positions:
                positionId = position['positionId']
                position_url = 'http://www.lagou.com/jobs/%s.html' % positionId
                parse_position_detail(position_url)
                break
            break
            # print(response.json())
            # .json() automatically loads the response body into a dict when it is JSON
    
    
    def parse_position_detail(url):
        response = requests.get(url, headers=headers)
        text = response.text
        html = etree.HTML(text)
        position_name = html.xpath('//span[@class="name"]/text()')[0]
        job_request_spans = html.xpath('//dd[@class="job_request"]//span')
        salary_span = job_request_spans[0]
        salary = salary_span.xpath('.//text()')[0].strip()
        city = job_request_spans[1].xpath('.//text()')[0].strip()
        city = re.sub(r'[\s/]', '', city)
        work_years = job_request_spans[2].xpath('.//text()')[0].strip()
        work_years = re.sub(r'[\s/]', '', work_years)
        education = job_request_spans[3].xpath('.//text()')[0].strip()
        education = re.sub(r'[\s/]', '', education)
        desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
        position={
            'position_name':position_name,
            'salary':salary,
            'city':city,
            'work_years':work_years,
            'education':education,
            'desc':desc,
        }
        print(position)
    
    def main():
        request_list_page()
    
    
    if __name__ == '__main__':
        main()
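
    The comment above leaves the rotating-cookie problem open. One commonly used workaround, shown only as a hedged sketch (it is not the approach these notes adopt in the next section, and the site may have changed since), is to let a requests.Session visit the HTML listing page first so that the cookies it sets are carried over to the Ajax request:

    import requests
    
    list_url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
    ajax_url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
        'Referer': list_url,
    }
    
    # fetch the listing page first so the session picks up fresh cookies,
    # then reuse the same session (and those cookies) for the Ajax endpoint
    session = requests.Session()
    session.get(list_url, headers=headers)
    data = {'first': 'true', 'pn': 1, 'kd': 'python'}
    response = session.post(ajax_url, headers=headers, data=data)
    print(response.json())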

    12. Hands-on: scraping Lagou with Selenium

    #  -*-coding:utf8 -*-
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from lxml import etree
    from selenium.webdriver.common.by import By
    import re
    import time
    
    
    class LagouSpider(object):
        driver_path = r'D:\chromedriver\chromedriver'
    
        def __init__(self):
            self.driver = webdriver.Chrome(executable_path=LagouSpider.driver_path)
            self.url = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
            self.positions = []
    
        def run(self):
            self.driver.get(self.url)
            while True:
                WebDriverWait(driver=self.driver, timeout=10).until(
                    # the XPath used for the wait must not end in text(), or an exception is raised
                    EC.presence_of_element_located((By.XPATH, '//div[@class="pager_container"]/span[last()]'))
                )
                # page_source holds the full rendered source, including data loaded via Ajax requests
                source = self.driver.page_source
                self.parse_list_page(source)
                try:
                    # after one page has been scraped, click "next page" and keep going
                    next_btn = self.driver.find_element_by_xpath('//div[@class="pager_container"]/span[last()]')
                    # on the last page the "next page" button is disabled, so stop there
                    if "pager_next_disabled" in next_btn.get_attribute('class'):
                        break
                    else:
                        next_btn.click()
                        time.sleep(7)
                except Exception:
                    print(source)
    
        def parse_list_page(self, source):
            html = etree.HTML(source)
            links = html.xpath('//a[@class="position_link"]/@href')
            for link in links:
                # request the detail pages on the current list one by one,
                # sleeping between them to throttle the request rate
                self.request_detail_page(link)
                time.sleep(10)
    
        def request_detail_page(self, url):
            # open the detail page in a new browser window
            self.driver.execute_script("window.open('%s')" % url)
            # switch the driver over to that new window
            self.driver.switch_to.window(self.driver.window_handles[1])
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, '//span[@class="name"]'))
            )
            source = self.driver.page_source
            self.parse_detail_page(source)
            # close() only closes the current window
            self.driver.close()
            # switch back to the job-list window
            self.driver.switch_to.window(self.driver.window_handles[0])
    
        def parse_detail_page(self, source):
            html = etree.HTML(source)
            position_name = html.xpath('//span[@class="name"]/text()')[0]
            job_request_spans = html.xpath('//dd[@class="job_request"]//span')
            salary_span = job_request_spans[0]
            salary = salary_span.xpath('.//text()')[0].strip()
            city = job_request_spans[1].xpath('.//text()')[0].strip()
            city = re.sub(r'[\s/]', '', city)
            work_years = job_request_spans[2].xpath('.//text()')[0].strip()
            work_years = re.sub(r'[\s/]', '', work_years)
            education = job_request_spans[3].xpath('.//text()')[0].strip()
            education = re.sub(r'[\s/]', '', education)
            desc = "".join(html.xpath("//dd[@class='job_bt']//text()")).strip()
            company_name=html.xpath("//h2[@class='fl']/text()")[0].strip()
            position = {
                'name': position_name,
                'company_name':company_name,
                'salary': salary,
                'city': city,
                'work_years': work_years,
                'education': education,
                'desc': desc
            }
            self.positions.append(position)
            print(position)
            print('=' * 40)
    
    
    if __name__ == '__main__':
        spider = LagouSpider()
        spider.run()
        print(spider.positions)
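
    The spider collects everything into self.positions but never persists it. A small hedged sketch of dumping that list to CSV once the run finishes; the field names match the dict built in parse_detail_page, and the output filename is arbitrary:

    import csv
    
    def save_positions(positions, filename='lagou_positions.csv'):
        # write the scraped job dicts to a CSV file, one row per position
        fieldnames = ['name', 'company_name', 'salary', 'city', 'work_years', 'education', 'desc']
        with open(filename, 'w', encoding='utf-8', newline='') as fp:
            writer = csv.DictWriter(fp, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(positions)
    
    if __name__ == '__main__':
        spider = LagouSpider()
        spider.run()
        save_positions(spider.positions)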
  • Original article: https://www.cnblogs.com/xufengnian/p/10788208.html