#xpath语法
1.基础语法:
nodename:节点名定位
/: 根节点
//:任意节点
.:当前节点
./:从当前节点出发,匹配其直接子节点
.//:从当前节点向下的任意位置匹配
例如:nodename[@属性='值']:根据节点的属性值进行定位
属性:根据节点的属性定位
text():获取节点的文本 --> '//div[@id="1"]/text()' 只取该节点的直接子文本;'//div[@id="1"]//text()' 取该节点及其所有后代节点的文本(结果中会包含换行符等空白文本)
@属性:获取节点的属性
2.属性定位:多属性定位 & 单属性多值匹配
2.1 多属性匹配: '//div/div/p[@class="item1" and @name="ptag"]'
2.2 单属性多值匹配: '//div[contains(@class,"item1")]' #可匹配到下列两个节点
<div class="item1 item2"></div>
<div class="item1 item3"></div>
3.按序选择
3.1 索引定位: '//div/ul/li[3]/text()',注意的是索引是从1开始
3.2 last()函数: '//div/ul/li[last()]/text()',定位最后一个;'//div/ul/li[last()-1]/text()',定位倒数第二个
3.3 position()函数: '//div/ul/li[position()>1]/text()',定位第一个以后的,不包含第一个
4.嵌套选择
经过xpath定位匹配到的节点,还可以再一次进行xpath匹配
li_list = tree.xpath('//div/ul/li') #有10个li
for li in li_list:
    aaa = li.xpath('表达式')
#练习1
import requests
from lxml import etree
# Fetch the listing page, extract the link texts of the "tpmenu" menu and
# append them to tag.txt, one per line.
url = 'https://www.ivsky.com/tupian/katongtupian/'
headers = {
    # Browser-style User-Agent string sent with the request.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
}
# Without a timeout requests may block indefinitely on an unresponsive server.
res = requests.get(url=url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page and writing nothing.
res.raise_for_status()
tree = etree.HTML(res.text)
# First attempt, kept for reference:
#   //div[@class="sort"]//ul[@class="tpmenu"]/li//a/text()
# NOTE: despite its name, img_urls holds the text of each menu link, not URLs.
img_urls = tree.xpath('//ul[@class="tpmenu"]/li/a/text()')
print(img_urls)
# Open the output file once rather than once per item. The guard preserves
# the earlier behaviour of not creating tag.txt when nothing matched.
if img_urls:
    with open("tag.txt", 'a', encoding='utf-8') as f:
        f.writelines(item + '\n' for item in img_urls)
#本地练习
from lxml import etree
# Parse the local practice page with lxml's HTML parser.
html_parser = etree.HTMLParser()
tree = etree.parse('./xpath_test.html', html_parser)

# 1. Locate a node by tag name. The result of xpath() is a list.
ret1 = tree.xpath('//title/text()')
# print(ret1)

# 2. Locate a node by attribute value.
ret2 = tree.xpath('//div[@name="laoda"]/text()')
# print(ret2)

# 3. Read an attribute of the matched nodes.
ret3 = tree.xpath('//div[@class="divtag"]/a/@href')
# print(ret3)

# "./" versus ".//": run further queries relative to each matched <li>.
li_list = tree.xpath('//div[@id="008"]/ul/li')
for li in li_list:
    # Evaluated in order: child <a> text, child <span> text, then the src of
    # any <img> below this <li>.
    title, price, img = (
        li.xpath(relative_path)
        for relative_path in ('./a/text()', './span/text()', './/img/@src')
    )
    # print(title)
    # print(price)
    # print('-'*50)

# Multi-attribute match: both conditions must hold on the same node.
ret4 = tree.xpath('//div[@class="c1" and @name="laoda"]/text()')
# print(ret4)

# Single attribute holding several values: match on a substring of @class.
ret5 = tree.xpath('//div[contains(@class, "c1")]/text()')
# print(ret5)

# Positional selection; the three queries below share the same <li> path.
divtag_li = '//div[@class="divtag"]/ul/li'
ret6 = tree.xpath(divtag_li + '[position()>1]/text()')
# print(ret6)
ret7 = tree.xpath(divtag_li + '[position()>2 and position()<5]/text()')
# print(ret7)
ret8 = tree.xpath(divtag_li + '[4]/text()')
print(ret8)