response.text   # the decoded response as a str
response.body   # the raw response as bytes

# If printing to the console fails with an encoding error, re-wrap stdout;
# gb18030 has better coverage (Chinese, Japanese, Korean):
import io, sys
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
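A quick way to see the str/bytes difference without crawling is to build a response locally (the url and markup below are placeholders, not from these notes):

from scrapy.http import HtmlResponse

html = '<html><body><a class="top_promotion" href="link.html">首页</a></body></html>'
response = HtmlResponse(url='http://example.com', body=html.encode('utf-8'), encoding='utf-8')
print(type(response.text))  # <class 'str'>
print(type(response.body))  # <class 'bytes'>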
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector   # legacy selector API


class HuabanSpider(scrapy.Spider):
    name = 'huaban'
    allowed_domains = ['huaban.com']
    start_urls = ['http://huaban.com/']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        result = hxs.select("//a[@class='top_promotion']")                         # selector object
        result = hxs.select("//a[@class='top_promotion']").extract()               # list of [<a>] strings
        result = hxs.select("//a[@class='top_promotion']").extract_first()         # first match
        result = hxs.select("//a[@class='top_promotion']/@href").extract_first()   # href attribute of the <a> tag
        result = hxs.select("//a[@class='top_promotion']/text()").extract_first()  # text content of the <a> tag
        # find all tags with class="recommend-imgbox recommend-box"
        # print(response.text)
        # recommend-line top_promotion
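The HtmlXPathSelector/select() spelling above is the legacy API; in current Scrapy the parse method can query the response directly. A minimal sketch using the same XPath as in the notes (.get()/.getall() are the modern names for .extract_first()/.extract()):

import scrapy


class HuabanSpider(scrapy.Spider):
    name = 'huaban'
    allowed_domains = ['huaban.com']
    start_urls = ['http://huaban.com/']

    def parse(self, response):
        # response.xpath() returns a SelectorList
        href = response.xpath("//a[@class='top_promotion']/@href").get()
        text = response.xpath("//a[@class='top_promotion']/text()").get()
        self.logger.info("top promotion: %s -> %s", text, href)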
The recommended approach:

from scrapy.selector import Selector

hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs.xpath('//a')
Common usages:

from scrapy.http import HtmlResponse
from scrapy.selector import Selector

# html: an HTML string defined elsewhere
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')
# hxs = HtmlXPathSelector(response)
# print(hxs)
# hxs = Selector(response=response).xpath('//a')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[2]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "id+")]')
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "id+")]/text()').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//a[re:test(@id, "id+")]/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)
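These expressions can be tried outside a spider as well; re:test(...) works because Scrapy selectors register the EXSLT regular-expressions namespace. A self-contained check (the <ul>/<li> markup is a made-up sample):

from scrapy.selector import Selector

sample = ('<html><body><ul>'
          '<li><a id="i1" href="link.html">first</a></li>'
          '<li><a id="i2" href="llink.html">second</a></li>'
          '</ul></body></html>')
sel = Selector(text=sample)
print(sel.xpath('//a[contains(@href, "link")]/@href').extract())     # ['link.html', 'llink.html']
print(sel.xpath('//a[starts-with(@href, "link")]/@href').extract())  # ['link.html']
print(sel.xpath(r'//a[re:test(@id, "i\d+")]/text()').extract())      # ['first', 'second']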
# Searching relative to the current node: prefix with ./ , nothing, or */
# Looping over the matched selector objects:
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     print(v)
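A self-contained version of the loop above (the <ul> markup and the price span are invented for illustration):

from scrapy.selector import Selector

page = ('<html><body><ul>'
        '<li><a href="item1.html"><span class="price">9.9</span></a></li>'
        '<li><a href="item2.html"><span class="price">19.9</span></a></li>'
        '</ul></body></html>')
ul_list = Selector(text=page).xpath('//body/ul/li')
for item in ul_list:
    # './a/...' and 'a/...' both search among this <li>'s children
    href = item.xpath('./a/@href').extract_first()
    price = item.xpath('./a/span[@class="price"]/text()').extract_first()
    print(href, price)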
# Getting the text:
item.xpath('./span[@class="price"]/text()')
item.xpath('./span[@class="price"]/text()').extract_first()  # first text node
item.xpath('div[@class="item_t"]/div[@class="class"]//a/@href').extract_first()  # / = children; // = descendants
hxs.xpath('//div[@class="recommend-imgbox recommend-box"]')            # selector object
hxs.xpath('//div[@class="recommend-imgbox recommend-box"]').extract()  # list of strings
# A bare // searches the whole HTML document by default
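Putting the pieces together for the huaban page, a hedged sketch of looping over the recommend boxes (the inner <a>/<img> structure of each box is an assumption about the page, not something recorded in these notes):

def parse(self, response):
    for box in response.xpath('//div[@class="recommend-imgbox recommend-box"]'):
        # assumed card structure: a link plus an image inside each box
        href = box.xpath('.//a/@href').extract_first()
        img = box.xpath('.//img/@src').extract_first()
        print(href, img)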
Summary of common operations:
//  --- descendants
/   --- children
Special cases:
item.xpath('//...')   # search from the document root
item.xpath('./...')   # relative to the current node, among its children
item.xpath('.//...')  # relative to the current node, among its descendants
item.xpath('a')       # relative to the current node's children (no leading / and no .)
Note: set DEPTH_LIMIT = 1 in settings.py to limit how many levels of "recursive" requests are followed (see the spider sketch at the end of this section).
/@attribute_name  --- take an attribute value
/text()           --- take the text content
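A compact demonstration of the prefixes above on a throwaway snippet (markup invented for illustration):

from scrapy.selector import Selector

sel = Selector(text='<div id="d"><p><a href="x.html">hi</a></p></div>')
div = sel.xpath('//div')[0]
print(div.xpath('//a').extract())                # // searches the whole document
print(div.xpath('./p').extract())                # ./  children of the current node
print(div.xpath('.//a').extract())               # .// descendants of the current node
print(div.xpath('p').extract())                  # no prefix: also children
print(div.xpath('.//a/@href').extract_first())   # x.html
print(div.xpath('.//a/text()').extract_first())  # hi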
yield Request(url='xxx', callback=self.parse)   # requires: from scrapy.http import Request
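A minimal sketch of how the yield fits into a spider, together with the depth limit mentioned above (the pagination XPath is an illustrative assumption):

import scrapy
from scrapy.http import Request


class HuabanSpider(scrapy.Spider):
    name = 'huaban'
    allowed_domains = ['huaban.com']
    start_urls = ['http://huaban.com/']

    # same effect as DEPTH_LIMIT = 1 in settings.py
    custom_settings = {'DEPTH_LIMIT': 1}

    def parse(self, response):
        # assumed pagination links; adjust the XPath to the real page
        for href in response.xpath('//a[contains(@href, "page")]/@href').extract():
            yield Request(url=response.urljoin(href), callback=self.parse)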