zoukankan      html  css  js  c++  java
  • Python 通过lxml遍历html xpath

    #coding:utf-8
    '''
    Created on 2017年10月9日
    
    @author: li.liu
    '''
    from selenium import webdriver
    from lxml import etree
    import urllib
    import urllib2
    import time
    
    # Page that is fetched for XPath discovery and later opened in the browser.
    # Alternative targets that were used while experimenting:
    #url='http://www.woyihome.com'
    #url='http://www.baidu.com'
    url='http://sso.woyihome.com/sso/pc-login'
    user_agent='Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36'

    # Form fields sent with the request; passing a data payload makes urllib2
    # issue a POST instead of a GET.
    values = {'name' : 'WHY',
              'location' : 'SDU',
              'language' : 'Python' }

    headers = { 'User-Agent' : user_agent }
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data, headers)
    response = urllib2.urlopen(req)
    try:
        # Keep the raw bytes. Calling .encode('utf-8') on a Python 2 byte
        # string first decodes it implicitly as ASCII, which raises
        # UnicodeDecodeError as soon as the page contains a non-ASCII byte.
        # lxml accepts the undecoded bytes directly.
        html1 = response.read()
    finally:
        # Release the HTTP connection once the body has been read.
        response.close()
    
    def test1():
        x1={}
    
        #html1=urllib.urlopen(url).read().decode('utf-8')
        #print html1
        hxml=etree.HTML(html1)
        #print hxml
        htree=etree.ElementTree(hxml)
        #print htree
        id_dite=htree.xpath('//*[@id]')
        #print id_dite
        coun=0
        for id_items in id_dite:
            #print id_items.items()
            #print htree.getpath(id_items)       
            for id_item in id_items.items():
                #print id_item
                if id_item[0]=='id':
                    id_str='//*[@id="'+id_item[1]+'"]'
                    x1[id_str]=[]
                    #print id_str
                    id_path=htree.getpath(htree.xpath(id_str)[0])
                    #print id_path
                    id_str1=id_str+'//*'
                    idelem_list=htree.xpath(id_str1)
                    #print idelem_list
                    for e in idelem_list:
                        if len(e.items())==0:
                            pass
                        else:
                            e_path=htree.getpath(e)
                            #print e_path                   
                            e_path1=e_path.split(id_path)
                            #print e_path1[1]
                            if len(e_path1)>1:
                                e_str=id_str+e_path1[1]
                                e_list=e_str.split('/')
                                if 'li' in e_list[len(e_list)-1] or 'ul' in e_list[len(e_list)-1] or 'span' in e_list[len(e_list)-1]:
                                    pass
                                else:
                                    #print e_str
                                    coun+=1
                                    x1[id_str].append(e_str)
        '''
        for i in x1:
        #print i
            for i1 in x1[i]:
                print i1
            
        '''                                
        a=0
        b=0                            
        driver=webdriver.Chrome()
        driver.get(url)
        #print driver.title                            
        for i in x1:
            #print i
            for i1 in x1[i]:
                #print i1
                try:
                    d=driver.find_element_by_xpath(i1)
                    a+=1
                    print d.text
                    time.sleep(2)
                    driver.find_element_by_xpath(i1).click()
                    headx=driver.window_handles
                    #print headx
                    print '当前页面地址:
    ',driver.current_url
                    time.sleep(1)
                    print i,'
    '
                    if len(headx)!=1:
                        driver.switch_to_window(headx[1])
                        durl= driver.current_url
                        print '当前页面地址:
    ',durl,'
    '
                        if 'woyihome' in durl:
                            driver.close()
                            driver.switch_to_window(headx[0])
                        else:
                            k=1
                            break
                    elif 'localhost' in driver.current_url:
                        
                        print a
                except :
                    pass
                    #print b
        print a        
                
                
                
        #driver.quit()        
                
                
                
                
                
                
                
            #print '===================================================='
                                    
                            
                            
                            
                            
                            
                            
                                
        print coun
                    
                    
                    
                    
                
                
                
                
    # Run the element-clicking check as soon as the script is executed.
    test1()
  • 相关阅读:
    密码验证合格程序(Python)
    Python找到所有子集
    Semi-Supervised Classification with Graph Convolutional Networks 阅读笔记
    2018 ICPC南京网络赛 L Magical Girl Haze 题解
    2018 CCPC网络赛 hdu6444 Neko's loop
    2018 CCPC 网络赛 Buy and Resell
    实对称矩阵可对角化证明
    矩阵的极分解证明
    关于欧几里得空间上的仿射变换的直观几何理解
    Codeforces Hello 2018 E题Logical Expression dp+最短路 好题
  • 原文地址:https://www.cnblogs.com/liuliu-word/p/8058078.html
Copyright © 2011-2022 走看看