zoukankan      html  css  js  c++  java
  • 爬虫模块之selenium模块

    一 模块的介绍

     selenium模块最初是一个自动化测试工具,它通过驱动浏览器来完全模拟用户对浏览器的操作,从而实现自动化测试。

    # Each assignment below is an alternative way to create a driver; every line
    # rebinds the same name, so keep only the one for the browser you want.
    from selenium import webdriver  # drives a browser
    browser=webdriver.Chrome()  # Google Chrome
    browser=webdriver.Firefox()   # Mozilla Firefox
    browser=webdriver.PhantomJS()  # PhantomJS, a browser without a UI
    browser=webdriver.Safari()
    browser=webdriver.Edge() 
    

    二 下载安装

    #安装:selenium+chromedriver
    pip3 install selenium
    下载chromedriver.exe放到python安装路径的scripts目录中即可,注意最新版本是2.29,并非2.9
    国内镜像网站地址:http://npm.taobao.org/mirrors/chromedriver/2.29/
    最新的版本去官网找:https://sites.google.com/a/chromium.org/chromedriver/downloads
    
    #验证安装
    C:\Users\Administrator>python3
    Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
    Type "help", "copyright", "credits" or "license" for more information.
    >>> from selenium import webdriver
    >>> driver=webdriver.Chrome() #弹出浏览器
    >>> driver.get('https://www.baidu.com')
    >>> driver.page_source
    
    #注意:
    selenium3默认支持的webdriver是Firefox,而Firefox需要安装geckodriver
    下载链接:https://github.com/mozilla/geckodriver/releases
    View Code
    #安装:selenium+phantomjs
    pip3 install selenium
    下载phantomjs,解压后把phantomjs.exe所在的bin目录放到环境变量
    下载链接:http://phantomjs.org/download.html
    
    #验证安装
    C:\Users\Administrator>phantomjs
    phantomjs> console.log('egon gaga')
    egon gaga
    undefined
    phantomjs> ^C
    C:\Users\Administrator>python3
    Python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)] on win32
    Type "help", "copyright", "credits" or "license" for more information.
    >>> from selenium import webdriver
    >>> driver=webdriver.PhantomJS() #无界面浏览器
    >>> driver.get('https://www.baidu.com')
    >>> driver.page_source
    View Code

    三 基本使用

     ActionChains:动作链,用于模拟鼠标拖动等一系列连续操作。

     expected_conditions:显式等待时使用的预期条件(配合WebDriverWait使用)

     find_element_by_id:id查找的方式。

     send_keys:发送查找的关键字

     click:点击事件

     current_url:获取正在驱动的url

     get_cookies:获取cookies信息

     page_source:页面源代码

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By  # locator strategies, e.g. By.ID, By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # keyboard key constants
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait  # waits until a page condition holds

    # Basic flow: open Baidu, submit a search, wait for the results, dump page data.
    browser=webdriver.Chrome()
    try:
        browser.get('https://www.baidu.com')

        search_box=browser.find_element_by_id('kw')
        # Under Python 2, prefix the Chinese literal with u to avoid an encoding error.
        search_box.send_keys('美女')
        # Press Enter to submit the query.
        search_box.send_keys(Keys.ENTER)

        # Wait at most 10 seconds for the element with id content_left to be present.
        results_present=EC.presence_of_element_located((By.ID,'content_left'))
        WebDriverWait(browser,10).until(results_present)

        print(browser.page_source)
        print(browser.current_url)
        print(browser.get_cookies())

    finally:
        browser.close()
    View Code

     四 选择器

     基本选择器查找: 

     find_element_by_id:根据ID查找

     find_element_by_link_text:通过文本查找

     find_element_by_partial_link_text:根据某些文本模糊查找到第一个内容

     find_element_by_class_name:通过class查找

     find_element_by_name:通过name属性查找

      补充:

      presence_of_all_elements_located:相对应的所有元素加载完毕过后

      presence_of_element_located:查找到第一个加载完毕后

      element_to_be_clickable:等待可以点击过后。

      By.CLASS_NAME:class查找的方式

      get_attribute:访问标签的属性

      text:访问文本

      tag_name:访问标签名

    # from selenium import webdriver
    # from selenium.webdriver import ActionChains
    # from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
    # from selenium.webdriver.common.keys import Keys #键盘按键操作
    # from selenium.webdriver.support import expected_conditions as EC
    # from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
    # import time
    #
    # try:
    #     '''
    #     find_element_by_id
    #     find_element_by_name
    #     find_element_by_link_text
    #     find_element_by_partial_link_text
    #     find_element_by_tag_name
    #     find_element_by_class_name
    #
    #     find_element_by_css_selector
    #     find_element_by_xpath
    #     '''
    #     driver = webdriver.Chrome()
    #     wait=WebDriverWait(driver,3)
    #     driver.get('https://www.baidu.com/')
    #
    #     # 1、find_element_by_id
    #     # input_tag=driver.find_element_by_id('kw')
    #     # print(input_tag.tag_name)
    #     # print(input_tag.get_attribute('name'))
    #     # print(input_tag.text)
    #
    #     # 2、find_element_by_link_text
    #     # login=driver.find_element_by_link_text('登录')
    #     # login.click()
    #
    #     # 3、find_element_by_partial_link_text
    #     login=driver.find_element_by_partial_link_text('登')
    #     login.click()
    #
    #     # 4、find_element_by_class_name
    #     # login_for_user=driver.find_element_by_class_name('tang-pass-footerBarULogin')
    #     # login_for_user=wait.until(EC.presence_of_element_located((By.CLASS_NAME,'tang-pass-footerBarULogin')))
    #     login_for_user=wait.until(EC.element_to_be_clickable((By.CLASS_NAME,'tang-pass-footerBarULogin')))
    #     # print(login_for_user)
    #     login_for_user.click()
    #
    #
    #     #4、find_element_by_name
    #     # input_user=driver.find_element_by_name('userName')
    #     # input_pwd=driver.find_element_by_name('password')
    #     # button=driver.find_element_by_id('TANGRAM__PSP_10__submit')
    #     #
    #     # input_user.send_keys('17094322519')
    #     # input_pwd.send_keys('11111111111')
    #     # button.click()
    #
    #
    #
    #     time.sleep(5)
    # finally:
    #     driver.close()
    View Code

     以上这些方法只能查找出第一个匹配的元素,如果想要查找所有匹配的元素,将这些查找方式中的element改成elements就可以了。如下:

        find_elements_by_name
        find_elements_by_xpath
        find_elements_by_link_text
        find_elements_by_partial_link_text
        find_elements_by_tag_name
        find_elements_by_class_name
        find_elements_by_css_selector
    

      find_element(s)_by_xpath:如果在没有一个合适定位的方式的时候就可以使用这个

      /:单斜杠,查找一个标签,可以从根标签一层一层的向内部查找。

      //:双斜杠,从当前页面查找出相对应的所有的标签。

      [数字]:确定查找到哪一个标签。

      [@属性=“属性值”]:属性的查找方式

      [contains(@属性,"属性值的部分内容")]:属性模糊查找

      //*:所有的标签

      [标签/@属性=“属性值”]:查找有这个标签的标签的值

      ..:两个点,代表的是上一级

    # XPath selectors
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By  # locator strategies, e.g. By.ID, By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # keyboard key constants
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait  # waits until a page condition holds
    import time

    # Create the driver before entering try: if construction fails there is
    # nothing to clean up, and the finally clause must not touch an unbound name.
    driver = webdriver.Chrome()
    try:
        # wait = WebDriverWait(driver, 3)
        driver.implicitly_wait(3)

        driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

        # 1. // versus /
        # tag=driver.find_element_by_xpath('/html/body/div/a')
        # print(tag.tag_name)
        # print(tag.text)
        # print(tag.get_attribute('href'))

        # tag=driver.find_elements_by_xpath('//a')
        # print(tag)

        # tag=driver.find_elements_by_xpath('//div//a')
        # tag=driver.find_elements_by_css_selector('div a')
        # print(len(tag))

        # 2. Select the n-th match
        # tag=driver.find_elements_by_xpath('//div//a[5]')
        # print(tag[0].text)

        # 3. Select by attribute
        # tag1=driver.find_element_by_xpath('//a[@href="image4.html"]')
        # tag2=driver.find_element_by_xpath('//a[4]')
        # tag3=driver.find_element_by_xpath('//a[contains(@href,"image4")]')
        #
        # print(tag1.text)
        # print(tag2.text)
        # print(tag3.text)

        # 4. Miscellaneous
        # driver.find_elements_by_xpath('//*[@class="xxxxx"]')
        # driver.find_elements_by_xpath('//div[@class="xxxxx"][@class="yyyyy"]')

        # print(driver.find_element_by_xpath('//a[img/@src="image2_thumb.jpg"]').text)
        # print(driver.find_element_by_xpath('//a/..').tag_name)

        # print([tag.tag_name for tag in driver.find_elements_by_xpath('//img//..')])

        # Print the position and the dimensions of the first <img> on the page.
        img=driver.find_element_by_xpath('//img')
        print(img.location)
        print(img.size)

        time.sleep(5)
    finally:
        # quit() ends the whole WebDriver session; close() only closes the current window.
        driver.quit()
    View Code

    五 交互操作

     location:坐标,横:x;竖:y

     size:大小,也就是内容的长宽

     implicitly_wait:隐式等待。

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By  # locator strategies, e.g. By.ID, By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # keyboard key constants
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait  # waits until a page condition holds

    browser=webdriver.Chrome()

    # Implicit wait: when an element being looked up is not loaded yet,
    # wait up to 10 seconds for it.
    browser.implicitly_wait(10)

    browser.get('https://www.baidu.com')

    # Type the query into the search box and submit it with Enter.
    search_box=browser.find_element_by_id('kw')
    search_box.send_keys('美女')
    search_box.send_keys(Keys.ENTER)

    # There is no explicit wait step here; the lookup raises if nothing is found.
    results=browser.find_element_by_id('content_left')
    print(results)

    browser.close()
    View Code

     显式等待:

    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By  # locator strategies, e.g. By.ID, By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # keyboard key constants
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait  # waits until a page condition holds

    browser=webdriver.Chrome()
    browser.get('https://www.baidu.com')

    # Type the query into the search box and submit it with Enter.
    search_box=browser.find_element_by_id('kw')
    search_box.send_keys('美女')
    search_box.send_keys(Keys.ENTER)

    # Explicit wait: wait (10 second timeout) for one specific element to be present.
    results_present=EC.presence_of_element_located((By.ID,'content_left'))
    WebDriverWait(browser,10).until(results_present)

    results=browser.find_element(By.CSS_SELECTOR,'#content_left')
    print(results)

    browser.close()
    View Code

     execute_script:直接写js代码

     clear:清空输入框

     iframe:在一个页面中嵌套一个页面

     switch_to.frame:切换到子页面

     switch_to.parent_frame:切换到父页面

     ActionChains(浏览器对象):拖动

     drag_and_drop(源,目标):从源拖动到目标

     perform():开始执行

     click_and_hold:点击不松手

     move_by_offset:偏移量

     release:松开鼠标

    # from selenium import webdriver
    # from selenium.webdriver import ActionChains
    # from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
    # from selenium.webdriver.common.keys import Keys #键盘按键操作
    # from selenium.webdriver.support import expected_conditions as EC
    # from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
    # import time
    #
    #
    # try:
    #     driver=webdriver.Chrome()
    #     driver.get('https://www.jd.com/')
    #     driver.implicitly_wait(3)
    #
    #     input_tag=driver.find_element_by_id('key')
    #     input_tag.send_keys('iphoneX')
    #     input_tag.send_keys(Keys.ENTER)
    #
    #     time.sleep(3)
    #     input_tag = driver.find_element_by_id('key')
    #     input_tag.clear()
    #     input_tag.send_keys('mac pro')
    #     input_tag.send_keys(Keys.ENTER)
    #
    #
    #     time.sleep(5)
    # finally:
    #     driver.close()
    
    
    
    # ActionChains
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By  # locator strategies, e.g. By.ID, By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys  # keyboard key constants
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait  # waits until a page condition holds
    import time


    # Create the driver before entering try: if construction fails there is
    # nothing to clean up, and the finally clause must not touch an unbound name.
    driver=webdriver.Chrome()
    try:
        driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
        driver.implicitly_wait(3)
        # Run arbitrary JavaScript in the page; here it pops up an alert.
        driver.execute_script('alert("hahha")')

        # driver.switch_to.frame('iframeResult')
        # driver.switch_to.parent_frame()



        # Option 1: a single chain that drags from the source straight to the target
        # source = driver.find_element_by_id('draggable')
        # target = driver.find_element_by_id('droppable')
        # actions=ActionChains(driver)
        # actions.drag_and_drop(source,target)
        # actions.perform()

        # Option 2: separate ActionChains, which lets you control each displacement
        # source = driver.find_element_by_id('draggable')
        # target = driver.find_element_by_id('droppable')

        # distance=target.location['x'] - source.location['x']
        #
        # ActionChains(driver).click_and_hold(source).perform()
        # ActionChains(driver).move_by_offset(xoffset=distance,yoffset=0).perform()
        # ActionChains(driver).release().perform()
        #
        # res=0
        # while res < distance:
        #     ActionChains(driver).move_by_offset(xoffset=1,yoffset=0).perform()
        #     res+=1
        # ActionChains(driver).release().perform()
        #


        time.sleep(5)
    finally:
        # quit() ends the whole WebDriver session; close() only closes the current window.
        driver.quit()
    View Code

    六 浏览器的前进和后退:

     back:后退

     forward:前进

    #浏览器的前进后退
    # import time
    # from selenium import webdriver
    #
    # browser=webdriver.Chrome()
    # browser.get('https://www.baidu.com')
    # browser.get('https://www.taobao.com')
    # browser.get('http://www.python.org/')
    #
    # time.sleep(3)
    # browser.back()
    # time.sleep(3)
    # browser.forward()
    # browser.close()
    View Code

    七 cookies

     get_cookies:获取cookies里面的信息。

    # Cookies: read the cookies of the current page, add two more, read them again.
    from selenium import webdriver

    browser=webdriver.Chrome()
    browser.get('https://www.zhihu.com/explore')
    print(browser.get_cookies())
    # add_cookie takes ONE cookie per call, described by the required keys
    # 'name' and 'value'; an arbitrary {'k1': 'xxx', 'k2': 'yyy'} mapping is rejected.
    browser.add_cookie({'name':'k1','value':'xxx'})
    browser.add_cookie({'name':'k2','value':'yyy'})
    print(browser.get_cookies())

    # browser.delete_all_cookies()
    View Code

    八 异常处理

    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

    # Create the driver before entering try: if construction fails there is
    # nothing to clean up, and the finally clause must not touch an unbound name.
    browser=webdriver.Chrome()
    try:
        browser.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
        # The frame name is deliberately wrong so that the lookup fails
        # and the handler below is exercised.
        browser.switch_to.frame('iframssseResult')

    except (TimeoutException,NoSuchFrameException) as e:
        # Both failures are handled the same way: report and carry on to cleanup.
        print(e)
    finally:
        # quit() ends the whole WebDriver session; close() only closes the current window.
        browser.quit()
    View Code

    九 选项卡管理

    # Tab management: a new tab can be opened with JavaScript (window.open) or with
    # an OS keyboard shortcut such as Ctrl+T; the JavaScript way is the most portable.
    import time
    from selenium import webdriver

    browser=webdriver.Chrome()
    browser.get('https://www.baidu.com')
    browser.execute_script('window.open()')

    print(browser.window_handles)  # handles of every open tab
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(10)
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')
    # Two tabs are open at this point. close() would only close the current one and
    # leave the other tab and the driver session running, so end the session with quit().
    browser.quit()
    View Code

    十 练习

    # Note: sites change their anti-automation strategy all the time; the point here is
    # the workflow. This code worked on 2017-11-7 and is not guaranteed to keep working.
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    browser=webdriver.Chrome()

    try:
        browser.get('http://mail.163.com/')

        waiter=WebDriverWait(browser,5)

        # The login form is inside an iframe: wait for it, then switch into it.
        login_frame=waiter.until(EC.presence_of_element_located((By.ID,'x-URS-iframe')))
        browser.switch_to.frame(login_frame)

        waiter.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.m-container')))

        # Locate the form controls first, then fill them in and submit.
        user_box=browser.find_element_by_name('email')
        password_box=browser.find_element_by_name('password')
        login_button=browser.find_element_by_id('dologin')
        user_box.send_keys('18611453110')
        password_box.send_keys('xxxx')
        login_button.click()

        # If a captcha shows up, uncomment the short section below.
        # import time
        # time.sleep(10)
        # button = browser.find_element_by_id('dologin')
        # button.click()

        waiter.until(EC.presence_of_element_located((By.ID,'dvNavTop')))
        # The second li of the navigation bar is the "compose mail" entry.
        nav_items=browser.find_elements_by_css_selector('#dvNavTop li')
        compose_entry=nav_items[1]
        compose_entry.click()


        waiter.until(EC.presence_of_element_located((By.CLASS_NAME,'tH0')))
        recipient_box=browser.find_element_by_class_name('nui-editableAddr-ipt')
        subject_box=browser.find_element_by_css_selector('.dG0 .nui-ipt-input')
        recipient_box.send_keys('378533872@qq.com')
        subject_box.send_keys('圣旨')
        print(subject_box.tag_name)


        # The mail body is edited inside another iframe.
        editor_frame=waiter.until(EC.presence_of_element_located((By.CLASS_NAME,'APP-editor-iframe')))
        browser.switch_to.frame(editor_frame)
        mail_body=browser.find_element(By.CSS_SELECTOR,'body')
        mail_body.send_keys('egon很帅,可以加工资了')

        # Switch back to the parent frame, where the toolbar lives.
        browser.switch_to.parent_frame()
        toolbar_button=browser.find_element_by_class_name('nui-toolbar-item')
        toolbar_button.click()

        # Sleep for a long time so the browser stays open and the result can be inspected.
        import time
        time.sleep(10000)

    except Exception as e:
        print(e)
    finally:
        browser.close()
    View Code
    from selenium import webdriver
    from selenium.webdriver import ActionChains
    from selenium.webdriver.common.by import By #按照什么方式查找,By.ID,By.CSS_SELECTOR
    from selenium.webdriver.common.keys import Keys #键盘按键操作
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait #等待页面加载某些元素
    import time
    
    
    def get_goods(driver):
        """Print every product on the current result page and on all following pages.

        For each element with class ``gl-item`` the name, detail link, price and
        comment count are printed. The link whose text contains '下一页' (next page)
        is then clicked and the next page is processed the same way.

        Pagination is a loop rather than recursion, so a long result list cannot
        hit the interpreter's recursion limit.

        :param driver: a WebDriver that already shows a result page.
        :return: None. Scraping stops silently at the first error, which is also
            how the last page is detected (no "next page" link can be found).
        """
        while True:
            try:
                goods = driver.find_elements_by_class_name('gl-item')

                for good in goods:
                    detail_url = good.find_element_by_tag_name('a').get_attribute('href')

                    # Product names may span several lines; join them into one.
                    p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
                    price = good.find_element_by_css_selector('.p-price i').text
                    p_commit = good.find_element_by_css_selector('.p-commit a').text

                    msg = '''
                商品 : %s
                链接 : %s
                价钱 :%s
                评论 :%s
                ''' % (p_name, detail_url, price, p_commit)

                    print(msg, end='\n\n')

                button = driver.find_element_by_partial_link_text('下一页')
                button.click()
                # Give the next page a moment to start loading.
                time.sleep(1)
            except Exception:
                # Best effort by design: any failure (typically the missing
                # "next page" link on the last page) ends the crawl.
                return
    
    def spider(url,keyword):
        """Open *url* in Chrome, search for *keyword* and print all result pages.

        :param url: address of the page that contains the search box with id ``key``.
        :param keyword: text typed into the search box before pressing Enter.
        :return: None. The results are printed by ``get_goods``.
        """
        driver = webdriver.Chrome()
        try:
            # Implicit wait: element lookups retry for up to 3 seconds.
            driver.implicitly_wait(3)
            # Navigation happens inside try so a failed page load still
            # reaches the cleanup below instead of leaking the browser.
            driver.get(url)
            input_tag=driver.find_element_by_id('key')
            input_tag.send_keys(keyword)
            input_tag.send_keys(Keys.ENTER)
            get_goods(driver)
        finally:
            # quit() ends the whole WebDriver session; close() only closes the current window.
            driver.quit()
    
    
    # Run the JD search only when this file is executed as a script, not on import.
    if __name__ == '__main__':
        spider('https://www.jd.com/',keyword='iPhone8手机')
    View Code
    #coding=utf-8
    from selenium import webdriver
    # from selenium import fire
    from selenium.webdriver.common.keys import Keys
    import re
    from selenium.webdriver.support.ui import Select
    
    import time
    from pyquery import PyQuery as pq
    from xlwt import *
    import calendar
    from collections import OrderedDict
    
    
    def _click_by_class(browser, class_name):
        """Locate the first element with the given class name and click it."""
        browser.find_element_by_class_name(class_name).click()


    def _read_message(browser):
        """Print and return (title, status, tag, info) of the compatibility message.

        Every field falls back to '' when it cannot be found on the page.
        """
        hdata = browser.page_source
        try:
            title = browser.find_element_by_class_name('section-compatibility__message__title').text
        except Exception:
            title = ''
        print(title)
        tip_rule = r'<div class="section-compatibility__message__tip"> <strong>(.*?): </strong><span>(.*?)</span> </div>'
        tips = re.findall(tip_rule, hdata, re.S)
        print(tips)
        # The pattern has two groups, so a match always supplies both fields.
        status, tag = tips[0] if tips else ('', '')
        try:
            info = browser.find_element_by_class_name(
                'section-compatibility__message__body').find_element_by_tag_name('p').text
        except Exception:
            info = ''
        print(info)
        return title, status, tag, info


    def _scrape_compatibility(browser):
        """Walk every manufacturer/model pair of the page and collect the message rows."""
        arrow_cls = 'select2-selection__arrow'
        close_cls = 'section-compatibility__message__close'
        option_rule = r'<option value=".*?">(.*?)</option>'

        browser.get("https://finsix.com/#section-compatibility")
        data = browser.page_source

        lis = [['product','version','title','status','tags','info']]
        re_rule_1 = r'<select class="section-compatibility__manufacturers select2-hidden-accessible" tabindex="-1" aria-hidden="true">(.*?)</select>'
        data_list = re.findall(re_rule_1, data, re.S)
        datalist = re.findall(option_rule, data_list[0], re.S)
        print(datalist)
        # Index 0 is skipped, as in the original (presumably a placeholder option - TODO confirm).
        for i in range(1, len(datalist)):
            try:
                _click_by_class(browser, arrow_cls)
            except Exception:
                # A message popup may cover the arrow: dismiss it and retry.
                _click_by_class(browser, close_cls)
                _click_by_class(browser, arrow_cls)
            maker_select = Select(browser.find_element_by_class_name('section-compatibility__manufacturers'))
            maker_option = maker_select.options[i]
            maker_name = maker_option.text
            print(maker_name)
            maker_option.click()
            try:
                _click_by_class(browser, arrow_cls)
            except Exception:
                _click_by_class(browser, close_cls)
                _click_by_class(browser, arrow_cls)
                maker_option.click()
                _click_by_class(browser, arrow_cls)

            time.sleep(2)

            rule1 = r'<select class="section-compatibility__models select2-hidden-accessible" tabindex="-1" aria-hidden="true">(.*?)</select>'
            data = browser.page_source
            if '<select class="section-compatibility__models select2-hidden-accessible" tabindex="-1" aria-hidden="true">' not in data:
                # The models <select> may be rendered in its disabled form instead.
                rule1 = '<select class="section-compatibility__models select2-hidden-accessible" disabled="" tabindex="-1" aria-hidden="true">(.*?)</select>'
            models_html = re.findall(rule1, data, re.S)
            if not models_html:
                print(len(models_html), models_html)
                lis.append([maker_name,'','','','',''])
                continue
            model_names = re.findall(option_rule, models_html[0], re.S)
            print(model_names)
            for j in range(0, len(model_names)):
                btn = browser.find_elements_by_class_name('select2-selection')
                try:
                    btn[1].click()
                except Exception:
                    try:
                        _click_by_class(browser, close_cls)
                        btn[1].click()
                    except Exception:
                        # Second and last retry; a further failure propagates.
                        _click_by_class(browser, close_cls)
                        btn[1].click()
                model_select = Select(browser.find_element_by_class_name('section-compatibility__models'))
                model_option = model_select.options[j]
                model_name = model_option.text
                print(model_name)
                model_option.click()
                try:
                    btn[1].click()
                except Exception:
                    try:
                        _click_by_class(browser, close_cls)
                        btn[1].click()
                    except Exception:
                        # The dropdown cannot be toggled: record what is shown and move on.
                        title, status, tag, info = _read_message(browser)
                        lis.append([maker_name, model_name, title, status, tag, info])
                        continue
                title, status, tag, info = _read_message(browser)
                lis.append([maker_name, model_name, title, status, tag, info])

                try:
                    _click_by_class(browser, close_cls)
                except Exception:
                    try:
                        _click_by_class(browser, close_cls)
                    except Exception:
                        try:
                            btn[1].click()
                            model_option.click()
                            btn[1].click()
                            _click_by_class(browser, close_cls)
                        except Exception:
                            continue
        return lis


    def openurl(num, driver_path=r"H:\chromedriver_win32\chromedriver.exe"):
        """Scrape the FINSIX compatibility table with Chrome and return it as rows.

        :param num: not used; kept so that existing calls keep working.
        :param driver_path: location of the chromedriver executable.
        :return: list of rows; the first row is the header
            ['product', 'version', 'title', 'status', 'tags', 'info'].
        """
        browser = webdriver.Chrome(executable_path=driver_path)
        try:
            return _scrape_compatibility(browser)
        finally:
            # Always end the WebDriver session, even when scraping fails.
            browser.quit()
    
    def zhizuo(lis, filename='product_info.csv'):
        """Write the rows of *lis* into a one-sheet workbook and return 'success'.

        :param lis: iterable of rows, each row an iterable of cell values;
            row i / cell j is written to sheet position (i, j).
        :param filename: output path. The default keeps the original name.
            NOTE(review): the data is saved through xlwt's ``Workbook``, which
            produces an Excel workbook rather than CSV text, so a ``.xls`` name
            would describe the content better - confirm before renaming.
        :return: the string 'success'.
        """
        book = Workbook(encoding='utf-8')
        sheet = book.add_sheet('data')
        for row_idx, row in enumerate(lis):
            for col_idx, cell in enumerate(row):
                sheet.write(row_idx, col_idx, cell)
        book.save(filename)
        return 'success'
    
    
    # NOTE(review): url is assigned but not used by any visible code; openurl
    # navigates to a page address written inside the function itself.
    url = 'https://www.xuangubao.cn/'
    # Scrape the rows; the argument 3 is not read by openurl in the visible code.
    lis = openurl(3)
    print(lis)
    # Save the scraped rows to disk.
    zhizuo(lis)
    # f=open("F:\text.txt","a")
    # for key,values in  dict.items():
    # f.write((key+"	"))
    # print(key,values)
    # f.close()
    爬取finsix

    破解滑动验证:

     http://www.cnblogs.com/fangjie0410/p/8269219.html

     

  • 相关阅读:
    Python 操控Mysql
    mysql5.7 root密码重置
    pandas合并两个excel到一个excel
    键盘控制
    激活浏览器窗口
    python的xlwings库读写excel操作总结
    python 读取Excel使用xlwing库
    CMDB(资产管理系统) day1
    Vue之简易的留言板功能
    vue之神奇的动态按钮
  • 原文地址:https://www.cnblogs.com/fangjie0410/p/8259558.html
Copyright © 2011-2022 走看看