在 items.py 中定义字段名
import scrapy


class HrItem(scrapy.Item):
    """Scrapy item declaring the three fields collected by the spider.

    Fields:
        title: value stored under the ``'title'`` key.
        position: value stored under the ``'position'`` key.
        pub_date: value stored under the ``'pub_date'`` key.
    """

    # Each field is declared with a plain scrapy.Field() (no extra metadata).
    title = scrapy.Field()
    position = scrapy.Field()
    pub_date = scrapy.Field()
当爬取到数据时
# Populate a fresh HrItem from `data`.
# NOTE(review): `data` is presumably the selector for one table row, since
# every XPath below is relative to it and addresses a <td> cell -- confirm
# against the surrounding spider code.
item = HrItem()
field_xpaths = {
    'title': "./td[1]/a/text()",
    'position': "./td[2]/text()",
    'pub_date': "./td[5]/text()",
}
for field_name, relative_xpath in field_xpaths.items():
    # extract_first() takes the first matching text node for the field.
    item[field_name] = data.xpath(relative_xpath).extract_first()
在 pipelines 中将数据储存进 MongoDB,需先将数据转换成 dict
from pymongo import MongoClient

# NOTE(review): HrItem is used below but its import is not visible in this
# snippet; it must be imported from the project's items module -- confirm.

# Module-level connection shared by every item the pipeline processes.
client = MongoClient()
collection = client['SpiderAnything']['hr']  # database name, collection name


class SpideranythingPipeline(object):
    """Item pipeline that writes HrItem instances to the MongoDB collection."""

    def process_item(self, item, spider):
        """Store ``item`` in MongoDB when it is an HrItem, then pass it on.

        Args:
            item: the scraped item handed to the pipeline.
            spider: the spider that produced the item (unused here).

        Returns:
            The same ``item``, unchanged, so later pipelines can process it.
        """
        # Check the item type so this pipeline only handles the items meant
        # for it; anything else is passed through untouched.
        if isinstance(item, HrItem):
            print(item)
            # The item is converted to a plain dict before being stored.
            # insert_one() replaces Collection.insert(), which was deprecated
            # in PyMongo 3.0 and removed in PyMongo 4.0.
            collection.insert_one(dict(item))
        return item