导包
import requests
from lxml import etree
# Request headers: identify as a desktop Chrome browser
# (presumably so the site serves its normal page -- confirm against the site).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
# Without a timeout requests may wait indefinitely on an unresponsive server.
res = requests.get(url='https://www.tuli.cc/index.html', headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page further down.
res.raise_for_status()
实例化对象
# Parse the response body into an element tree that can be queried with XPath.
tree = etree.HTML(res.text)
# xpath() returns a list; for an element path like this one the items are
# Element objects (not strings), and the list is empty when nothing matches.
ret = tree.xpath('//*[@id="img-container"]/div[6]/div/div/div[1]/a/img')
# Guard the index so a page-layout change yields an empty result instead of
# an IndexError; "./@src" reads the src attribute relative to the <img> node.
ret_src = ret[0].xpath('./@src') if ret else []
print(ret_src)
1) /: 根节点
2) //: 任意位置
3) .: 当前节点
4) nodename: 节点名定位, *代表任意节点名
5) 节点属性定位: 节点名[@属性名="属性值"], 例如 div[@id="divtag"]
6) 获取节点的属性: @属性名
7) 获取节点文本: text()
单属性多值匹配 & 多属性匹配
1).单属性多值: <div class="divtag clear"></div><div class="divtag clear item"></div>
contains函数: tree.xpath('//div[contains(@class, "divtag")]')
2).多属性匹配: <div class="divtag" name="item"></div> <div class="divtag"></div>
and关键字: tree.xpath('//div[@class="divtag" and @name="item"]')
按序选择:
1).索引定位: 注意一下, xpath的索引从1开始
2).last()函数: 定位最后一个
3).position()函数: 位置函数, 确定节点的位置
案例:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Xpath练习文件</title>
<!-- Practice document for the XPath queries that follow; presumably saved as ./xpath_test.html, the path the script parses. -->
</head>
<body>
<!-- Mixed content: a bare text node next to sibling <p> and <div> children (used to compare /text() with //text()). -->
<div id="007">
"我是div标签的文字内容, 和下面的p标签还有div标签是同级的哦"
<p>这是p标签内的文字内容</p>
<div>这是p标签同级的div标签</div>
</div>
<!-- Five list items and one link: used for index, last() and position() selection and for reading @href. -->
<div class="divtag">
<ul>
<li>第1个li标签</li>
<li>第2个li标签</li>
<li>第3个li标签</li>
<li>第4个li标签</li>
<li>第5个li标签</li>
</ul>
<a href="https://www.baidu.com">这是百度的跳转连接</a>
</div>
<!-- Three sibling divs; the second carries two class values ("c1 c3"): used for contains() and multi-attribute matching. -->
<div class="c1" name="laoda">老大在此</div>
<div class="c1 c3" name="laoer">老二任性, class有两个值</div>
<div class="c1" name="laosan">我是老三</div>
<!-- Five child divs, each holding one <h1> and one <span>: used for relative (./ and .//) queries. -->
<div id="112">
<div>
<h1>lsdjfihlfdjofjwjbcl</h1>
<span>就返回使得开发</span>
</div>
<div>
<h1>lsdjfihlfdjofjwjbcl</h1>
<span>你好</span>
</div>
<div>
<h1>lsdjfihlfdjofjwjbcl</h1>
<span>我好</span>
</div>
<div>
<h1>lsdjfihlfdjofjwjbcl</h1>
<span>他好</span>
</div>
<div>
<h1>lsdjfihlfdjofjwjbcl</h1>
<span>都挺好</span>
</div>
</div>
</body>
</html>
from lxml import etree
# Parse the local practice file using lxml's HTML parser.
tree = etree.parse('./xpath_test.html', etree.HTMLParser())
# "//" matches at any depth; locate the node by tag name and take its text.
ret1 = tree.xpath('//title/text()')
# Locate a node by attribute value; "/text()" yields only the div's own direct text nodes.
ret2 = tree.xpath('//div[@id="007"]/text()')
# print(ret2)
# "//text()" yields the text nodes of the div and of all its descendants.
ret3 = tree.xpath('//div[@id="007"]//text()')
# print(ret3)
# Read an attribute value with "@attribute-name".
ret4 = tree.xpath('//div[@class="divtag"]/a/@href')
# print(ret4)
# "." anchors an XPath at the current node: "./" selects direct children,
# ".//" selects descendants at any depth.
# For every child <div> of the div with id "112", pair its <h1> text with its
# <span> text (intended output shape: "<h1 text>-----<span text>").
div_list = tree.xpath('//div[@id="112"]/div')
for div in div_list:
    # The loop body must be indented; at column 0 it is an IndentationError.
    first = div.xpath('.//h1/text()')[0]
    second = div.xpath('./span/text()')[0]
    # print('%s-----%s' % (first, second))
# One attribute holding several values: contains() matches every div whose
# class attribute includes the substring "c1".
ret5 = tree.xpath('//div[contains(@class, "c1")]/text()')
print(ret5)
# Matching on several attributes at once: join the predicates with "and".
# The class comparison is an exact string match, so class="c1 c3" does not qualify.
ret6 = tree.xpath('//div[@class="c1" and @name="laosan"]/text()')
print(ret6)
# last() is the position of the final node, so last()-1 selects the second-to-last <li>.
ret7 = tree.xpath('//div[@class="divtag"]/ul/li[last()-1]/text()')
# print(ret7)
# position() gives the 1-based position; this keeps the <li> nodes at positions 3 and 4.
ret8 = tree.xpath('//div[@class="divtag"]/ul/li[position()>2 and position()<5]/text()')
print(ret8)
# Keeps the <li> nodes at positions 1 and 2 and at any position above 4.
ret9 = tree.xpath('//div[@class="divtag"]/ul/li[position()<3 or position()>4]/text()')
print(ret9)