参考:https://cuiqingcai.com/5545.html
XPath
- XML Path Language
- 在XML文档中查找信息,同样适用于HTML文档
- 使用路径选择表达式的方式查找信息
XPath常用规则
- nodename:选取此节点的所有子节点
- /:从当前节点选取直接子节点
- //: 从当前节点选取子孙节点
- .: 选取当前节点
- ..: 选取当前节点的父节点
- @: 选取属性
# Sample HTML fragment shared by the examples below.
# Note: the last <li> has no closing </li> tag.
text = (
    ' <div> <ul> '
    '<li class="item-0"><a href="link1.html">first item</a></li> '
    '<li class="item-1"><a href="link2.html">second item</a></li> '
    '<li class="item-inactive"><a href="link3.html">third item</a></li> '
    '<li class="item-1"><a href="link4.html">fourth item</a></li> '
    '<li class="item-0"><a href="link5.html">fifth item</a> '
    '</ul> </div> '
)
选取所有节点
from lxml import etree

# '//*' matches every element node in the parsed document.
html = etree.HTML(text)
all_nodes = html.xpath('//*')
print(all_nodes)
输出
[<Element html at 0x1761bfd5508>, <Element body at 0x1761bfd5a88>, <Element div at 0x1761bfd5ac8>, <Element ul at 0x1761bfd5b08>, <Element li at 0x1761bfd5e88>, <Element a at 0x1761bfd5f08>, <Element li at 0x1761bfd5f48>, <Element a at 0x1761bfd5f88>, <Element li at 0x1761bfd5fc8>, <Element a at 0x1761bfd5ec8>, <Element li at 0x1761bfdb048>, <Element a at 0x1761bfdb088>, <Element li at 0x1761bfdb0c8>, <Element a at 0x1761bfdb108>]
子节点
from lxml import etree

# '//li/a' selects the <a> elements that are direct children of any <li>.
html = etree.HTML(text)
anchors = html.xpath('//li/a')
print(anchors)
输出
[<Element a at 0x1761c02dec8>, <Element a at 0x1761c02de88>, <Element a at 0x1761c02df08>, <Element a at 0x1761c02df48>, <Element a at 0x1761c02df88>]
父节点
from lxml import etree

# '..' steps from each <li> up to its parent element.
html = etree.HTML(text)
parents = html.xpath('//li/..')
print(parents)
输出
[<Element ul at 0x1761ae7c288>]
属性匹配
from lxml import etree

# The [@class="item-0"] predicate keeps only <li> elements whose class
# attribute equals "item-0".
html = etree.HTML(text)
matching_items = html.xpath('//li[@class="item-0"]')
print(matching_items)
输出
[<Element li at 0x1761afe2dc8>, <Element li at 0x1761c067748>]
注:XPath表达式在Python中用单引号包裹,因此其中的属性值[@class="item-0"]要使用双引号(内外引号类型不能相同)
文本获取
from lxml import etree

html = etree.HTML(text)

# text() directly under the matching <li> elements.
li_text = html.xpath('//li[@class="item-0"]/text()')
# text() under the <a> child of the matching <li> elements.
anchor_text = html.xpath('//li[@class="item-0"]/a/text()')

print(li_text)
print(anchor_text)
输出
[' '] ['first item', 'fifth item']
注://li[@class="item-0"]/text()得到[' '],因"/"只选取直接子节点,而"first item"等文本位于<a>节点内,不是<li>的直接子节点;得到的空白文本来自最后一个未闭合的<li>
属性获取
from lxml import etree

# '@href' returns the attribute values themselves rather than elements.
html = etree.HTML(text)
hrefs = html.xpath('//li[@class="item-0"]/a/@href')
print(hrefs)
输出
['link1.html', 'link5.html']
属性多值匹配
from lxml import etree

# The class attribute here holds two values: "li" and "li-first".
text1 = ' <li class="li li-first"><a href="link.html">first item</a></li> '

selector = etree.HTML(text1)

# Exact comparison fails: the attribute value is "li li-first", not "li".
# (The attribute name was previously misspelled "calss", so the query came
# back empty for the wrong reason.)
result1 = selector.xpath('//li[@class="li"]/a/text()')

# contains() matches when the attribute value includes the given substring.
result2 = selector.xpath('//li[contains(@class,"li")]/a/text()')

print(result1)
print(result2)
输出
[] ['first item']
多属性匹配
from lxml import etree

text2 = ' <li class="li li-first" name="item"><a href="link.html">first item</a></li> '

selector = etree.HTML(text2)

# Combine two conditions with "and": the class must contain "li" and the
# name attribute must equal "item".
result = selector.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')

# The closing parenthesis was missing here, which made the snippet a SyntaxError.
print(result)
输出
['first item']
按序选择
from lxml import etree

text = (
    ' <div> <ul> '
    '<li class="item-0"><a href="link1.html">first item</a></li> '
    '<li class="item-1"><a href="link2.html">second item</a></li> '
    '<li class="item-inactive"><a href="link3.html">third item</a></li> '
    '<li class="item-1"><a href="link4.html">fourth item</a></li> '
    '<li class="item-0"><a href="link5.html">fifth item</a> '
    '</ul> </div> '
)

html = etree.HTML(text)

# Positions are 1-based: [1] is the first <li>.
first_item = html.xpath('//li[1]/a/text()')
print(first_item)

# last() refers to the final <li>.
last_item = html.xpath('//li[last()]/a/text()')
print(last_item)

# position()<3 keeps the first two <li> elements.
first_two = html.xpath('//li[position()<3]/a/text()')
print(first_two)

# last()-2 is the third <li> counting from the end.
third_from_end = html.xpath('//li[last()-2]/a/text()')
print(third_from_end)
输出
['first item'] ['fifth item'] ['first item', 'second item'] ['third item']
节点轴选择
from lxml import etree

text3 = (
    ' <div> <ul> '
    '<li class="item-0"><a href="link1.html"><span>first item</span></a></li> '
    '<li class="item-1"><a href="link2.html">second item</a></li> '
    '<li class="item-inactive"><a href="link3.html">third item</a></li> '
    '<li class="item-1"><a href="link4.html">fourth item</a></li> '
    '<li class="item-0"><a href="link5.html">fifth item</a> '
    '</ul> </div> '
)

html = etree.HTML(text3)

# One query per axis, evaluated and printed in this order.
axis_queries = (
    '//li[1]/ancestor::*',              # every ancestor of the first <li>
    '//li[1]/ancestor::div',            # only the <div> ancestor
    '//li[1]/attribute::*',             # all attribute values of the first <li>
    '//child::a[@href="link1.html"]',   # <a> child elements with this href
    '//li[1]/descendant::span',         # <span> descendants of the first <li>
    '//li[1]/following::*[2]',          # second element after the first <li>
    '//li[1]/following-sibling::*',     # all later siblings of the first <li>
)

for query in axis_queries:
    print(html.xpath(query))
输出
[<Element html at 0x1761c02db88>, <Element body at 0x1761c07bf08>, <Element div at 0x1761c078308>, <Element ul at 0x1761c086088>] [<Element div at 0x1761c078308>] ['item-0'] [<Element a at 0x1761c086288>] [<Element span at 0x1761c06e6c8>] [<Element a at 0x1761c06e688>] [<Element li at 0x1761c078b08>, <Element li at 0x1761c078648>, <Element li at 0x1761c0864c8>, <Element li at 0x1761c086448>]