zoukankan      html  css  js  c++  java
  • Python 通过lxml 解析html页面自动组合xpath实例

    #coding:utf-8
    '''
    @author: li.liu
    '''
    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionBuilder, ActionChains
    from lxml import etree
    import urllib
    import time
    import re
    
    
    # Page whose id-anchored XPaths are extracted and clicked through below.
    # The scheme is required: Python 2's urllib.urlopen() treats a scheme-less
    # URL as a local file path, and WebDriver navigation needs an absolute URL.
    # Alternative target: url='http://www.baidu.com'
    url='http://www.woyihome.com'
    # One shared Chrome session, used by test1() and test2().
    driver= webdriver.Chrome()
    driver.get(url)
    # Title of the start page; test2() compares against it to detect that a
    # click navigated away.
    web_title=driver.title
    def test1():
        head=driver.current_window_handle
        print driver.current_url
        xpathset=set()
        try:
            html1=urllib.urlopen(url).read().decode('utf-8')
            hetree=etree.HTML(html1)#lxml解析html
            lxml1=etree.ElementTree(hetree)#lxml.etree解析html
            hiter=hetree.iter()#加载到迭代器中
            #print hiter
            str1=''
            
            for t in hiter:#遍历每个元素
                for item in t.items():#遍历每个元素的属性
                    c=0
                    d=0
                    for i in item:#遍历每个属性的名字和值  
                        if i == 'id':#查找属性名为id的元素
                            str1 ='//*[@'+i+'="'+item[c+1]+'"]'##通过id属性值定位达到元素
                            xx=lxml1.xpath(str1)#查找元素
                            #print '
    ',xx
                            lgx=lxml1.getpath(xx[0])#查找元素路径
                            #print lgx
                            s= lxml1.xpath(str1+'//*')#查找子元素生成list列表
                            for s1 in s:#遍历所有属性为str1的子元素
                                #print s1.text
                                #print lxml1.getpath(s1)
                                for ss1 in s1.items():#遍历str1子元素的属性
                                    for sss1 in ss1 :#遍历属性名和值
                                            try:
                                                #print sss1
                                                lgs1=lxml1.getpath(s1)#获取str1子元素s1的路径
                                                path_split=lgs1.split(lgx)[1]#分割子处理元素属性值的字符串
                                                str3=str+path_split#生成xpath
                                                print '
    ',str3
                                                xpathset.add(str3)
    
                                            except:pass
                        #else:   
                            #pass              
                                #print lxml1.getpath(s1)
                            #print i,'
    '
                        c+=1
                        d+=1
                    print '.',
            print '
    '
            '''
            for i in xpathset:
                print i
                               
                try:
                    driver.find_element_by_xpath(i).text
                    driver.find_element_by_xpath(i).click()
                    durll=driver.current_url
                    headx=driver.window_handles
                    #print headx
                    print '当前页面地址:
    ',durll
                    time.sleep(1)
                    print i,'
    '
                    if len(headx)!=1:
                        driver.switch_to_window(headx[1])
                        durl= driver.current_url
                        print '当前页面地址:
    ',durl,'
    '
                        if '101.37.179.183' in durl:
                            driver.close()
                            driver.switch_to_window(headx[0])
                        else:
                            k=1
                            break
                        
                        
                    else:
                        driver.get(url)
                    
                    
                
                
                except:
                    pass
                '''
            print len(xpathset)            
                #print '	'
            
            #driver.get('http://101.37.179.183')
            #print driver.title
            
        finally:
            #driver.quit()
            print '...'
        '''    
        try:
            time.sleep(1)
            #print driver.find_element_by_xpath('//*[@id="wrapper"]'),1
            #print driver.find_element_by_xpath('//*[@id="wrapper"]/div[2]/a[1]')
            driver.find_element_by_xpath(str1)
            time.sleep(50000)
        finally:
            print 3
            driver.quit()        
        '''    
            
    def test2():
        http_dict={}
        durll=''
        http_dict[durll]=[]
        head=driver.current_window_handle
        xpath_dict={}
        xpathset=set()
        #try:
        html1=urllib.urlopen(url).read().decode('utf-8')
        hetree=etree.HTML(html1)#lxml解析html
        lxml1=etree.ElementTree(hetree)#lxml.etree解析html
        hiter=hetree.iter()#加载到迭代器中
        #print hiter
        hid1=lxml1.xpath('//*[@id]')
        hid=lxml1.xpath('//*[@id]//*')
        for t in hid1:
            id_items=t.items()
            print t.items()#打印id属性的元素所有属性
            tpath=lxml1.getpath(t)
            print tpath#打印id属性的元素的路径
            
            
            for id in id_items:
                if 'id' in id[0]:
                    str1='//*[@id="'+id[1]+'"]'
                    xpath_dict[str1]=[]
                    #print xpath_dict
                    print str1
                    str3=str1+'//*'
                    print str3
                    id_list= lxml1.xpath(str3)
                    for idist in id_list:
                        idpath= lxml1.getpath(idist)
                        idxpathlist=idpath.split(tpath)
                        if len(idxpathlist)>1:  
                            id_xpath=str1+idxpathlist[1]
                            xpath_dict[str1].append(id_xpath)
                            #print xpath_dict[str1]
                            #print idxpathlist
                        #else:
                            #print '+++++++++++++++++++++++++++++++++++++++'
                            #print idxpathlist,'stop',len(idxpathlist)
                    print '=============================================='
        cont=0
        k=0
        
        for i in xpath_dict:
            #print xpath_dict[i]
            for t in xpath_dict[i]:
                durll=''
                try:
                    time.sleep(1)
                    elem_text=driver.find_element_by_xpath(t).text
                    driver.find_element_by_xpath(t).click()
                    durll=driver.current_url
                    headx=driver.window_handles
                    #print headx
                    
                    if len(headx)!=1:
                        driver.switch_to_window(headx[1])
                        durll= driver.current_url
                        print '链接元素名:',elem_text
                        print '页面名:',driver.title
                        print '当前页面地址:
    ',durll
                        print t,'
    '
                        if '101.37.179.183' in durll:
                            driver.close()
                            driver.switch_to_window(headx[0])
                        else:
                            k=1
                            break
                    else:
                        if driver.title !=web_title:
                            print '链接元素名:',elem_text
                            print '页面名:',driver.title
                            print '当前页面地址:
    ',durll
                            print t,'
    '
                            driver.back()
                        pass
                    
                except:
                    if k==1 or 'localhost' in durll:
                        pass
                    else:
                        try:
                            print '动态首项xpath:',dict[i][0]
                            elem=driver.find_element_by_xpath(xpath_dict[i][0])
                            ActionChains(driver).move_to_element(elem).perform()
                            time.sleep(1)
                            driver.find_element_by_xpath(t).click()
                            print '当前动态页面地址为:','
    ',driver.current_url
                            print t,'
    '
                            if driver.title !=web_title:
                                t1= '链接元素名:'+elem_text
                                t2= '页面名:'+driver.title
                                t3= '当前页面地址:'+durll
                                print t1,'
    ',t2,'
    ',t3,'
    ',t,'
    '
                                http_dict[durll].append(t1)
                                http_dict[durll].append(t2)
                                http_dict[durll].append(t3)
                                driver.back()
                        except(Exception):
                            pass
                            #print Exception
                
                cont+=1
                print cont   
        
        
        with open('E:/1/http.txt', 'w') as handle:
            for t in http_dict:
                str2=t+''+str(http_dict[t])
                handle.writelines(str2)
            
        
        
        
        
        
        
        
        
        
    # Script entry: run the click-through crawl, then print the end marker.
    test2()
    print '结束'
    # The browser session is not closed here; enable the call below to close it.
    #driver.quit()
    
        
  • 相关阅读:
    P4387 P4387 【深基15.习9】验证栈序列
    P1241 括号序列题解
    P2058 海港题解
    P1540 机器翻译题解
    leaflet + react + typescript
    TypeScript中文手册:从 JavaScript 迁移到 TypeScript
    react-esri-leaflet与typescript
    TypeError: Super expression must either be null or a function
    前端库(gis前端库和普通库分开)
    react-leaflet:Module parse failed: Unexpected token (10:41)
  • 原文地址:https://www.cnblogs.com/liuliu-word/p/8058014.html
Copyright © 2011-2022 走看看