zoukankan      html  css  js  c++  java
  • 解析库--XPath

    from lxml import etree

    # Walk-through of common XPath queries using lxml. Each section builds a
    # small HTML fragment, parses it, and prints the result of one query.

    text = '''
    <div>
    <ul>
    <li class = "item-0"><a href = "link1.html">first item</a></li>
    <li class = "item-1"><a href = "link2.html">second item</a></li>
    <li class = "item-inactive"><a href = "link3.html">third item</a></li>
    <li class = "item-1"><a href = "link4.html">fourth item</a></li>
    <li class = "item-0"><a href = "link5.html">fifth item</a></li>
    </ul>
    </div>
    '''
    html = etree.HTML(text)

    # Serialize the parsed (auto-corrected) tree back to markup. The value is
    # bytes and is not printed here; use result.decode("utf-8") to inspect it.
    result = etree.tostring(html)

    # Basic node selection.
    code_all = html.xpath("//*")  # every element in the document
    code_li = html.xpath("//li")  # every <li> element
    code_a = html.xpath("//li/a")  # <a> elements that are direct children of <li>
    # Starting from a known child node, step up with ".." and read the parent's
    # class attribute.
    code_p = html.xpath("//a[@href = 'link4.html']/../@class")
    print(code_p)
    print(code_li)
    print("///")
    print(code_all)
    print("///")
    print(code_a)

    # Attribute matching: keep only <li> nodes whose class is exactly "item-0".
    attribute = html.xpath("//li[@class = 'item-0']")
    print(attribute)

    # Text retrieval. text() only returns text that is a direct child of the
    # selected node. Every <li> here contains nothing but an <a>, so this query
    # yields an empty list; use "//li/a/text()" or "//li//text()" to reach the
    # link text. Stored under its own name so the HTML source string in `text`
    # is not overwritten.
    li_text = html.xpath("//li/text()")
    print(li_text)

    # Attribute retrieval: "@name" returns the attribute values themselves.
    attribute_get = html.xpath("//li/a/@href")
    print(attribute_get)

    # Matching an attribute that holds several values: "=" would require the
    # whole attribute string to match, so use contains() instead.
    text1 = """
    <li class = "li li-first"><a href = "link.html">first item</a></li>
    """
    html1 = etree.HTML(text1)
    attribute_number = html1.xpath("//li[contains(@class,'li')]/a/text()")
    print(attribute_number)

    # Matching on several attributes at once: combine predicates with "and".
    text2 = """
    <li class = "li li-first" name = "name"><a href = "link.html">first item</a></li>
    """
    html2 = etree.HTML(text2)
    attribute_text2 = html2.xpath("//li[contains(@class,'li') and @name = 'name']/a/text()")
    print(attribute_text2)

    # Selecting by position.
    # Sometimes a predicate matches several nodes but only one of them is
    # wanted. In that case pass an index (or a positional function) inside
    # square brackets to pick the node at a specific position. XPath positions
    # start at 1, not 0.
    text3 = '''
    <div>
    <ul>
    <li class = "item-0"><a href = "link1.html">first item</a></li>
    <li class = "item-1"><a href = "link2.html">second item</a></li>
    <li class = "item-inactive"><a href = "link3.html">third item</a></li>
    <li class = "item-1"><a href = "link4.html">fourth item</a></li>
    <li class = "item-0"><a href = "link5.html">fifth item</a></li>
    </ul>
    </div>
    '''
    html3 = etree.HTML(text3)
    result = html3.xpath("//li[1]/a/text()")  # first <li>
    print(result)
    result = html3.xpath("//li[last()]/a/text()")  # last <li>
    print(result)
    result = html3.xpath("//li[position() < 3]/a/text()")  # <li> nodes at positions 1 and 2
    print(result)

    # Selecting along node axes.
    result = html3.xpath("//li[1]/ancestor::*")  # all ancestors; "*" matches any element
    print(result)
    result = html3.xpath("//li[1]/ancestor::div")  # only the <div> ancestor
    print(result)
    result = html3.xpath("//li[1]/attribute::*")  # all attribute values of the first <li>
    print(result)
     #运行结果
        ['item-1']
    [<Element li at 0x7f72f489c888>, <Element li at 0x7f72f489c948>, <Element li at 0x7f72f489c9c8>, <Element li at 0x7f72f489ca08>, <Element li at 0x7f72f489ca88>]
    ///
    [<Element html at 0x7f72f489c808>, <Element body at 0x7f72f489c788>, <Element div at 0x7f72f489c748>, <Element ul at 0x7f72f489c848>, <Element li at 0x7f72f489c888>, <Element a at 0x7f72f489c908>, <Element li at 0x7f72f489c948>, <Element a at 0x7f72f489c988>, <Element li at 0x7f72f489c9c8>, <Element a at 0x7f72f489c8c8>, <Element li at 0x7f72f489ca08>, <Element a at 0x7f72f489ca48>, <Element li at 0x7f72f489ca88>, <Element a at 0x7f72f489cac8>]
    ///
    [<Element a at 0x7f72f489c908>, <Element a at 0x7f72f489c988>, <Element a at 0x7f72f489c8c8>, <Element a at 0x7f72f489ca48>, <Element a at 0x7f72f489cac8>]
    [<Element li at 0x7f72f489c888>, <Element li at 0x7f72f489ca88>]
    []
    ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
    ['first item']
    ['first item']
    ['first item']
    ['fifth item']
    ['first item', 'second item']
    [<Element html at 0x7f72f489cdc8>, <Element body at 0x7f72f489cec8>, <Element div at 0x7f72f489cf48>, <Element ul at 0x7f72f489cf08>]
    [<Element div at 0x7f72f489cf48>]
    ['item-0']
           
    笨鸟先飞
  • 相关阅读:
    leetcode 347. Top K Frequent Elements
    581. Shortest Unsorted Continuous Subarray
    leetcode 3. Longest Substring Without Repeating Characters
    leetcode 217. Contains Duplicate、219. Contains Duplicate II、220. Contains Duplicate、287. Find the Duplicate Number 、442. Find All Duplicates in an Array 、448. Find All Numbers Disappeared in an Array
    leetcode 461. Hamming Distance
    leetcode 19. Remove Nth Node From End of List
    leetcode 100. Same Tree、101. Symmetric Tree
    leetcode 171. Excel Sheet Column Number
    leetcode 242. Valid Anagram
    leetcode 326. Power of Three
  • 原文地址:https://www.cnblogs.com/zoutingrong/p/13809730.html
Copyright © 2011-2022 走看看