zoukankan      html  css  js  c++  java
  • 解析库--XPath

    from lxml import etree
      2 text = '''
      3 <div>
      4 <ul>
      5 <li class = "item-0"><a herf = "link1.html">first item</a></li>
      6 <li class = "item-1"><a herf = "link2.html">second item</a></li>
      7 <li class = "item-inactive"><a herf = "link3.html">third item</a></li>                                             
      8 <li class = "item-1"><a herf = "link4.html">fourth item</a></li>
      9 <li class = "item-0"><a herf = "link5.html">fifth item</a></li>
     10 </ul>
     11 </div>
     12 '''
     13 html = etree.HTML(text)  # parse the sample markup into an element tree
     14 result = etree.tostring((html))#serialize the repaired HTML text; NOTE: this value is never printed before `result` is reassigned further down
     15 code_all = html.xpath("//*")#select every element node in the document
     16 code_li = html.xpath("//li")  # select all li nodes anywhere in the document
     17 code_a = html.xpath("//li/a")#select the a nodes that are direct children of li nodes
     18 code_p = html.xpath("//a[@herf = 'link4.html']/../@class")#starting from a known child node, step up with `..` and read the parent's class attribute; `herf` is the attribute name as spelled in the sample markup
     19 print(code_p)
     20 print(code_li)
     21 print("///")  # visual separator between the printed result lists
     22 print(code_all)
     23 print("///")
     24 print(code_a)
     25 # Attribute matching: keep only the li nodes whose class equals 'item-0'
     26 attribute = html.xpath("//li[@class = 'item-0']")
     27 print(attribute)
     28 # Text extraction
        # NOTE(review): text() here selects text nodes that are direct children of li.
        # In the sample markup the visible text sits inside the nested a node, and the
        # recorded output for this print is [] -- "//li/a/text()" is presumably what
        # was intended; confirm. This also rebinds `text`, which held the HTML source.
    29 text = html.xpath("//li/text()")
     30 print(text)
     31 # Attribute retrieval: read the herf attribute value of every a node under an li
     32 attribute_get = html.xpath("//li/a/@herf")
     33 print(attribute_get)
     34 # Matching an attribute that holds several values: the class below is
        # "li li-fist", so the predicate uses contains() to test for the 'li'
        # substring instead of comparing the whole attribute value.
     35 text1 = """
     36 <li class = "li li-fist"><a href = "link.html">first item</a></li>
     37 """
     38 html1 = etree.HTML(text1)
     39 attribute_number = html1.xpath("//li[contains(@class,'li')]/a/text()")  # text of the a node inside the matched li
     40 print(attribute_number)
     41 # Matching on several attributes at once: the two conditions are joined with `and`.
        # `calss` is the attribute name exactly as spelled in the sample markup below,
        # so the XPath predicate uses the same spelling.
     42 text2 = """
     43 <li calss = "li li-first" name = "name"><a href = "link.html">first item</a></li>
     44 """
     45 html2 = etree.HTML(text2)
     46 attribute_text2 = html2.xpath("//li[contains(@calss,'li') and @name = 'name']/a/text()")
     47 print(attribute_text2)
     48 #按序选择
     49 """
     50 有时候,我们在选择的时候某些属性可能同时匹配了多个节点,但是只想要其中某个节点
     51 这时可以利用中括号传入索引的方法获取特定次序的节点
     52 """
     53 text3 = '''
     54 <div>
     55 <ul>        
      <li class = "item-0"><a herf = "link1.html">first item</a></li>
     57 <li class = "item-1"><a herf = "link2.html">second item</a></li>
     58 <li class = "item-inactive"><a herf = "link3.html">third item</a></li>
     59 <li class = "item-1"><a herf = "link4.html">fourth item</a></li>
     60 <li class = "item-0"><a herf = "link5.html">fifth item</a></li>
     61 </ul>
     62 </div>
     63 '''
     64 html3 = etree.HTML(text3)  # parse the second copy of the sample list
     65 result = html3.xpath("//li[1]/a/text()")#select the first li node (XPath positions start at 1)
     66 print(result)
     67 result = html3.xpath("//li[last()]/a/text()")#select the last li node
     68 print(result)
     69 result = html3.xpath("//li[position() < 3]/a/text()")#select the li nodes whose position is less than 3, i.e. the first two
     70 print(result)
     71 # Selecting nodes along an axis
     72 result = html3.xpath("//li[1]/ancestor::*")#get all ancestor nodes; the trailing * matches every node on the axis
     73 print(result)
     74 result = html3.xpath("//li[1]/ancestor::div")#get only the div ancestor
     75 print(result)
     76 result = html3.xpath("//li[1]/attribute::*")#get all attribute values of the first li
     77 print(result)
     #运行结果
        ['item-1']
    [<Element li at 0x7f72f489c888>, <Element li at 0x7f72f489c948>, <Element li at 0x7f72f489c9c8>, <Element li at 0x7f72f489ca08>, <Element li at 0x7f72f489ca88>]
    ///
    [<Element html at 0x7f72f489c808>, <Element body at 0x7f72f489c788>, <Element div at 0x7f72f489c748>, <Element ul at 0x7f72f489c848>, <Element li at 0x7f72f489c888>, <Element a at 0x7f72f489c908>, <Element li at 0x7f72f489c948>, <Element a at 0x7f72f489c988>, <Element li at 0x7f72f489c9c8>, <Element a at 0x7f72f489c8c8>, <Element li at 0x7f72f489ca08>, <Element a at 0x7f72f489ca48>, <Element li at 0x7f72f489ca88>, <Element a at 0x7f72f489cac8>]
    ///
    [<Element a at 0x7f72f489c908>, <Element a at 0x7f72f489c988>, <Element a at 0x7f72f489c8c8>, <Element a at 0x7f72f489ca48>, <Element a at 0x7f72f489cac8>]
    [<Element li at 0x7f72f489c888>, <Element li at 0x7f72f489ca88>]
    []
    ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
    ['first item']
    ['first item']
    ['first item']
    ['fifth item']
    ['first item', 'second item']
    [<Element html at 0x7f72f489cdc8>, <Element body at 0x7f72f489cec8>, <Element div at 0x7f72f489cf48>, <Element ul at 0x7f72f489cf08>]
    [<Element div at 0x7f72f489cf48>]
    ['item-0']
           
    笨鸟先飞
  • 相关阅读:
    asyncio异步IO--协程(Coroutine)与任务(Task)详解
    python爬虫实战:利用scrapy,短短50行代码下载整站短视频
    深入理解Git的实现原理
    Upsource 代码审查工具安装及使用
    MAC MAMP集成环境安装 PHP 扩展
    千万数据量数据表分表实践
    设计模式:序言
    设计模式 行为型
    PHP5底层原理之变量
    PHP5底层原理之垃圾回收机制
  • 原文地址:https://www.cnblogs.com/zoutingrong/p/13809730.html
Copyright © 2011-2022 走看看