1. Common XPath expressions
Locating by attribute:
# find the div tag whose class attribute is "song"
//div[@class="song"]
Locating by hierarchy & index:
# find the direct child a tag under the second li child of the direct child ul of the div whose class attribute is "tang"
//div[@class="tang"]/ul/li[2]/a
Logical operators:
# find a tags whose href attribute is empty and whose class attribute is "du"
//a[@href="" and @class="du"]
Fuzzy matching:
//div[contains(@class, "ng")]
//div[starts-with(@class, "ta")]
Extracting text:
# /text() returns only the text directly under a tag
# //text() returns the text of the tag and of all of its descendant tags
//div[@class="song"]/p[1]/text()
//div[@class="tang"]//text()
Extracting attributes:
//div[@class="tang"]//li[2]/a/@href
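The expressions above can be tried out against a small made-up HTML snippet; the markup below is hypothetical and written only to match the class names used in the examples.
from lxml import etree

html = '''<html><body>
  <div class="song">
    <p>first paragraph</p>
    <a href="" class="du">empty link</a>
  </div>
  <div class="tang">
    <ul>
      <li><a href="/one">one</a></li>
      <li><a href="/two">two</a></li>
    </ul>
  </div>
</body></html>'''
tree = etree.HTML(html)

print(tree.xpath('//div[@class="song"]'))                  # locate by attribute
print(tree.xpath('//div[@class="tang"]/ul/li[2]/a'))       # hierarchy & index
print(tree.xpath('//a[@href="" and @class="du"]'))         # logical operators
print(tree.xpath('//div[contains(@class, "ng")]'))         # fuzzy: class contains "ng"
print(tree.xpath('//div[starts-with(@class, "ta")]'))      # fuzzy: class starts with "ta"
print(tree.xpath('//div[@class="song"]/p[1]/text()'))      # text directly under p[1]
print(tree.xpath('//div[@class="tang"]//text()'))          # text of the div and all descendants
print(tree.xpath('//div[@class="tang"]//li[2]/a/@href'))   # attribute value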
2. Instantiating an etree object
- Local file: tree = etree.parse(file_path)
  tree.xpath("XPath expression")
- Network data: tree = etree.HTML(page_source_string)
  tree.xpath("XPath expression")
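A short sketch of both instantiation styles; './page.html' and the URL below are placeholders, not from the original. For local files, passing an etree.HTMLParser lets parse() tolerate HTML that is not well-formed XML.
import requests
from lxml import etree

# local file: use an HTML parser so imperfect markup does not raise an error
tree = etree.parse('./page.html', etree.HTMLParser())
print(tree.xpath('//title/text()'))

# network data: parse the downloaded page source string
page_text = requests.get(url='https://example.com').text
tree = etree.HTML(page_text)
print(tree.xpath('//title/text()'))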
3. Crawler example
Goal: crawl and parse second-hand housing data from 58.com
# parse the listing title from the first-level page, and the price and description from each second-level (detail) page
import requests
from lxml import etree
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
url = 'https://bj.58.com/changping/ershoufang/?utm_source=sem-baidu-pc&spm=105916147073.26420796294&PGTID=0d30000c-0000-17fc-4658-9bdfb947934d&ClickID=3'
page_text = requests.get(url=url,headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@class="house-list-wrap"]/li')
data = []
for li in li_list:
    # parse the listing title and the detail-page URL
    title = li.xpath('.//div[@class="list-info"]/h2/a/text()')[0]
    detail_page_url = li.xpath('.//div[@class="list-info"]/h2/a/@href')[0]
    if not detail_page_url.startswith('https:'):
        detail_page_url = 'https:' + detail_page_url
    detail_text = requests.get(url=detail_page_url, headers=headers).text
    # use a separate tree for the detail page so the listing tree is not overwritten
    detail_tree = etree.HTML(detail_text)
    # parse the description and price from the detail page
    desc = ''.join(detail_tree.xpath('//div[@id="generalDesc"]//div[@class="general-item-wrap"]//text()')).strip('\n')
    price = ''.join(detail_tree.xpath('//div[@id="generalExpense"]//div[@class="general-item-wrap"]//text()')).strip('\n')
    dic = {
        'title': title,
        'desc': desc,
        'price': price
    }
    data.append(dic)
# persist the data (here we just print it; a JSON sketch follows below)
print(data)
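Since the code above only prints the result, a minimal persistence sketch is to dump the collected list of dicts to a JSON file; the file name ershoufang.json is an arbitrary choice.
import json

with open('./ershoufang.json', 'w', encoding='utf-8') as fp:
    json.dump(data, fp, ensure_ascii=False, indent=2)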