zoukankan      html  css  js  c++  java
  • XPATH应用

      1 # -*- coding:utf-8 -*-
      2 '''
      3 Created on Sep 10, 2018
      4 
      5 @author: SaShuangYiBing
      6 '''
      7 from lxml import etree
      8 
      9 html='''
     10 <html>
     11     <head>
     12         <title>哈哈测试一下</title>
     13         <link type="text/css" rel="stylesheet" href="haha.css" />
     14         <link type="text/css" rel="stylesheet" href="haha1.css" />
     15         <link type="text/css" rel="stylesheet" href="haha2.css" />
     16         <script type="text/javascript" src="haha.js"></script>
     17         <script type="text/javascript" src="haha1.js"></script>
     18         <script type="text/javascript" src="haha2.js"></script>
     19     </head>
     20     <body>
     21         <div id="id1" class="class1">
     22             <div id="id2" class="class2">
     23                 <ul class="cls_ul1">
     24                     <li class="cls_li1">
     25                         <div class="cls_3">
     26                             <span>span_text1</span>
     27                             <span>span_text2</span>
     28                             <i>text_1</i>
     29                         </div>
     30                         <div>
     31                             <a href="a_1.html">a_1</a>
     32                             <a href="a_2.html">a_2</a>
     33                             <a href="a_3.html">a_3</a>
     34                         </div>
     35                         <div class="cls_4">
     36                             <a href="a_4.html">
     37                                 <img href="a_img1.jpg" />
     38                             </a>
     39                         </div>
     40                     </li>
     41                     <li class="cls_li1">
     42                         <div class="cls_3">
     43                             <span>span_text3</span>
     44                             <span>span_text4</span>
     45                             <i>text_2</i>
     46                             <i>text_22</i>
     47                         </div>
     48                         <div>
     49                             <a href="a_4.html">a_4</a>
     50                             <a href="a_5.html">a_5</a>
     51                             <a href="a_6.html">a_6</a>
     52                         </div>
     53                         <div class="cls_4">
     54                             <a href="a_5.html">
     55                                 <img href="a_img2.jpg" />
     56                             </a>
     57                         </div>
     58                     </li>
     59                 </ul>
     60             </div>
     61             <div id="id3" class="class3">
     62                 <ul class="cls_ul2">
     63                     <li class="cls_li2">
     64                         <div class="cls_5">
     65                             <span>span_text5</span>
     66                             <span>span_text6</span>
     67                             <i>text_3</i>
     68                         </div>
     69                         <div>
     70                             <a href="a_1.html">a_1</a>
     71                             <a href="a_2.html">a_2</a>
     72                             <a href="a_3.html">a_3</a>
     73                         </div>
     74                         <div class="cls_6">
     75                             <a href="a_4.html">
     76                                 <img href="a_img3.jpg" />
     77                             </a>
     78                         </div>
     79                     </li>
     80                     <li class="cls_li2">
     81                         <div class="cls_5">
     82                             <span>span_text7</span>
     83                             <span>span_text8</span>
     84                             <i>text_4</i>
     85                         </div>
     86                         <div>
     87                             <a href="a_4.html">a_4</a>
     88                             <a href="a_5.html">a_5</a>
     89                             <a href="a_6.html">a_6</a>
     90                         </div>
     91                         <div class="cls_6">
     92                             <a href="a_5.html">
     93                                 <img href="a_img4.jpg" />
     94                             </a>
     95                         </div>
     96                     </li>
     97                 </ul>
     98             </div>
     99         </div>
    100     </body>
    101 </html>
    102 '''
    103 
    104 html_data = etree.HTML(html)  # parse the HTML text above; every xpath() call below queries this result
    105 
    106 # 1、从根节点开始,沿着XML路径一步一步选择节点,text()表示节点内容
    107 content = html_data.xpath("/html/head/title/text()")
    108 for con in content:
    109     print (con)
    110 print ("~~~~~~~~~这是第一个分隔线~~~~~~~~~")
    111 
    112 # 2、从根节点开始,沿着XML路径一步一步选择节点,text表示节点内容
    113 nodes = html_data.xpath("/html/head/title")
    114 for i in nodes:
    115     print (i.text)
    116 print ("~~~~~~~~~这是第二个分隔线~~~~~~~~~") 
    117    
    118 # 3、从文档中某个节点开始,不考虑此节点位置,text()表示节点内容
    119 content = html_data.xpath("//title/text()")
    120 for con in content:
    121     print (con)
    122 print ("~~~~~~~~~这是第三个分隔线~~~~~~~~~")  
    123   
    124 # 4、获取所有div(html/body/div/div)的id属性值
    125 nodes = html_data.xpath("/html/body/div/div")
    126 for i in range(len(nodes)):
    127     content = nodes[i].xpath("@id")
    128     for con in content:
    129         print (con)
    130 print ("~~~~~~~~~这是第四个分隔线~~~~~~~~~") 
    131    
    132 # 5、body节点下某节点的属性值
    133 content = html_data.xpath("body/div/div[@id= 'id2']/ul/li[1]/div[2]/a/@href")
    134 for con in content:
    135     print (con)
    136 print ("~~~~~~~~~这是第五个分隔线~~~~~~~~~")
    137 
    138 # 6、div[@id='id2']节点下某节点的属性值
    139 content = html_data.xpath("//div[@id = 'id2']/ul/li[1]/div[2]/a/@href")
    140 for con in content:
    141     print (con)
    142 print ("~~~~~~~~~这是第六个分隔线~~~~~~~~~")
    143 
    144 # 7、div[@id='id2']节点下某节点的内容
    145 content = html_data.xpath("//div[@id= 'id2']/ul/li[1]/div[2]/a/text()")
    146 for con in content:
    147     print (con)
    148 print ("~~~~~~~~~这是第七个分隔线~~~~~~~~~")
    149     
    150 # 8、用'*'来匹配任何元素
    151 content = html_data.xpath("*//div[@id = 'id2']/ul/li[1]/div[2]/a/text()")
    152 for con in content:
    153     print (con)
    154 print ("~~~~~~~~~这是第八个分隔线~~~~~~~~~")   
    155 
    156 # 9、选取多个节点
    157 nodes = html_data.xpath("//i|//span")
    158 for i in range(len(nodes)):
    159     print (nodes[i].text)
    160 print ("~~~~~~~~~这是第九个分隔线~~~~~~~~~")     
    161 
    162 # 10、选取所有li节点
    163 nodes = html_data.xpath("//li")
    164 for i in range(len(nodes)):
    165     content = nodes[i].xpath("div/@class") # li节点下所有div节点的class属性值
    166     print (i,'='*5)
    167     for con in content:
    168         print (con)
    169 print ("~~~~~~~~~这是第十个分隔线~~~~~~~~~") 
    170 
    171 # 11、选取所有li节点
    172 nodes = html_data.xpath("//li")
    173 for i in range(len(nodes)):
    174     content = nodes[i].xpath("div[last()]/@class")  # li节点下最后一个div节点的class属性值
    175     print (i, '='*5)
    176     for con in content:
    177         print (con)
    178 print ("~~~~~~~~~这是第十一个分隔线~~~~~~~~~") 
    179 
    180 # 12、这里应用了'..'和'@',其中'..'表示父节点,具体就是上一步(title)的父节点head;'@'表示属性,就是它后面接是属性名,在这里的意思就是属性href的内容
    181 content = html_data.xpath("/html/head/title/../script/@src")
    182 for con in content:
    183     print (con)
    184 print ("~~~~~~~~~这是第十二个分隔线~~~~~~~~~")
    185 
    186 # 13、div[@class='cls_3']的子节点span的兄弟节点i
    187 nodes = html_data.xpath("//div[@class = 'cls_3']/span/following-sibling::i")
    188 for i in range(len(nodes)):
    189     content = nodes[i].xpath("./text()")  # 当前节点内容
    190     for con in content:
    191         print (con)
    192 print ("~~~~~~~~~这是第十三个分隔线~~~~~~~~~")
    193 
    194 # 14、li[@class='cls_li1']后代节点里第一个div的class属性值
    195 content = html_data.xpath("//li[@class = 'cls_li1']/descendant::div[1]/@class")
    196 for con in content:
    197     print (con)
    198 print ("~~~~~~~~~这是第十四个分隔线~~~~~~~~~")
    199 
    200 
    201 # 15、li[@class='cls_li1']后代节点里span的内容
    202 content = html_data.xpath("//li[@class = 'cls_li1']/descendant::span/text()")
    203 for con in content:
    204     print (con)
    205 print ("~~~~~~~~~这是第十五个分隔线~~~~~~~~~")
    206 
    207 # 16、用'*'来匹配任何元素,且不包含class属性的div节点
    208 content = html_data.xpath("*//div[@id = 'id2']/ul/li[1]/div[not(@class)]/a/text()")
    209 for con in content:
    210     print (con)
    211 print ("~~~~~~~~~这是第十六个分隔线~~~~~~~~~")   
    212 
    213 # 17、多个条件的情况
    214 content = html_data.xpath("//div[@id= 'id2' and @class= 'class2']/ul/li[1]/div[1]/span/text()")
    215 for con in content:
    216     print (con)
    217 print ("~~~~~~~~~这是第十七个分隔线~~~~~~~~~") 
    218 
    219 # 18、contains 包含的情况
    220 content = html_data.xpath("//div[contains(@class,'class2')]/ul/li[2]/div[2]/a/@href")
    221 for con in content:
    222     print (con)
    223 print ("~~~~~~~~~这是第十八个分隔线~~~~~~~~~") 
    224 
    225 输出如下:
    226 
    227 哈哈测试一下
    228 ~~~~~~~~~这是第一个分隔线~~~~~~~~~
    229 哈哈测试一下
    230 ~~~~~~~~~这是第二个分隔线~~~~~~~~~
    231 哈哈测试一下
    232 ~~~~~~~~~这是第三个分隔线~~~~~~~~~
    233 id2
    234 id3
    235 ~~~~~~~~~这是第四个分隔线~~~~~~~~~
    236 a_1.html
    237 a_2.html
    238 a_3.html
    239 ~~~~~~~~~这是第五个分隔线~~~~~~~~~
    240 a_1.html
    241 a_2.html
    242 a_3.html
    243 ~~~~~~~~~这是第六个分隔线~~~~~~~~~
    244 a_1
    245 a_2
    246 a_3
    247 ~~~~~~~~~这是第七个分隔线~~~~~~~~~
    248 a_1
    249 a_2
    250 a_3
    251 ~~~~~~~~~这是第八个分隔线~~~~~~~~~
    252 span_text1
    253 span_text2
    254 text_1
    255 span_text3
    256 span_text4
    257 text_2
    258 text_22
    259 span_text5
    260 span_text6
    261 text_3
    262 span_text7
    263 span_text8
    264 text_4
    265 ~~~~~~~~~这是第九个分隔线~~~~~~~~~
    266 0 =====
    267 cls_3
    268 cls_4
    269 1 =====
    270 cls_3
    271 cls_4
    272 2 =====
    273 cls_5
    274 cls_6
    275 3 =====
    276 cls_5
    277 cls_6
    278 ~~~~~~~~~这是第十个分隔线~~~~~~~~~
    279 0 =====
    280 cls_4
    281 1 =====
    282 cls_4
    283 2 =====
    284 cls_6
    285 3 =====
    286 cls_6
    287 ~~~~~~~~~这是第十一个分隔线~~~~~~~~~
    288 haha.js
    289 haha1.js
    290 haha2.js
    291 ~~~~~~~~~这是第十二个分隔线~~~~~~~~~
    292 text_1
    293 text_2
    294 text_22
    295 ~~~~~~~~~这是第十三个分隔线~~~~~~~~~
    296 cls_3
    297 cls_3
    298 ~~~~~~~~~这是第十四个分隔线~~~~~~~~~
    299 span_text1
    300 span_text2
    301 span_text3
    302 span_text4
    303 ~~~~~~~~~这是第十五个分隔线~~~~~~~~~
    304 a_1
    305 a_2
    306 a_3
    307 ~~~~~~~~~这是第十六个分隔线~~~~~~~~~
    308 span_text1
    309 span_text2
    310 ~~~~~~~~~这是第十七个分隔线~~~~~~~~~
    311 a_4.html
    312 a_5.html
    313 a_6.html
    314 ~~~~~~~~~这是第十八个分隔线~~~~~~~~~
  • 相关阅读:
    PRML 读书记录
    What’s the difference between Taxonomies and Ontologies? Ask Dr. Search
    C#中IP地址转换为数值的方法
    [转]读《简约至上》有感 及我的支语片言
    读《一名毕业生的程序员之路》有感
    [转载]温故知新 javascript 正则表达式
    [转]jQuery 1.9 移除了 $.browser 的替代方法
    QQ网盘首页,这样也能上线!做产品的人是白痴啊!
    [转]P3P解决cookie存取的跨域问题
    【转】跨浏览器“复制到粘贴板”JavaScript代码
  • 原文地址:https://www.cnblogs.com/aziji/p/9674315.html
Copyright © 2011-2022 走看看