  • Scrapy learning notes (3): saving data with Item and Pipeline

    In Scrapy, using an Item is the proper way to collect scraped data.
    Define the fields you want to keep in the Item, then handle each Item in a pipeline, so the crawl flow becomes:

    crawl --> collect the needed data according to the item definition --> process with a pipeline (storage, etc.)

    Define the item: declare the fields to scrape in items.py

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class GetquotesItem(scrapy.Item):
        # define the fields for your item here like:
        # Define the fields we want to scrape:
        # 1. quote text
        # 2. author
        # 3. tags
        content = scrapy.Field()
        author = scrapy.Field()
        tags = scrapy.Field()
    
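    Once the fields are declared, a GetquotesItem behaves much like a dict. A minimal sketch of how it can be populated and converted (the sample values are made up for illustration):

    from getquotes.items import GetquotesItem

    # fields can be set via keyword arguments or item['key'] assignment
    item = GetquotesItem(author='Albert Einstein')
    item['content'] = 'The world as we have created it is a process of our thinking.'
    item['tags'] = ['change', 'deep-thoughts']

    # dict(item) yields a plain dict -- exactly what the pipeline later inserts into MongoDB
    print(dict(item))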

    We keep the database connection settings in settings.py so they are easy to reference:

    MONGODB_HOST = 'localhost'
    MONGODB_PORT = 27017
    MONGODB_DBNAME = 'store_quotes2'
    MONGODB_TABLE = 'quotes2'
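
    These values can be read back through Scrapy's Settings object. A small sketch, assuming it is run from inside the project (Scrapy locates settings.py via scrapy.cfg), using get_project_settings:

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    host = settings.get('MONGODB_HOST', 'localhost')
    port = settings.getint('MONGODB_PORT', 27017)
    dbname = settings.get('MONGODB_DBNAME')
    print(host, port, dbname)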

    Also, make sure to uncomment the ITEM_PIPELINES block in settings.py; otherwise the pipeline will never be called:

    #ITEM_PIPELINES = {
    #    'getquotes.pipelines.SomePipeline': 300,
    #}

    Change it to:

    ITEM_PIPELINES = {
        'getquotes.pipelines.GetquotesPipeline': 300,
    }
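
    The number 300 is the pipeline's priority: values range from 0 to 1000, and items pass through enabled pipelines in ascending order. If more pipelines were added later, the order could be controlled like this (DuplicatesPipeline is a hypothetical name, not part of this project):

    ITEM_PIPELINES = {
        # lower numbers run first: filter duplicates before storing
        'getquotes.pipelines.DuplicatesPipeline': 100,
        'getquotes.pipelines.GetquotesPipeline': 300,
    }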

    Now define the item-processing logic in pipelines.py:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    
    # Import the project settings so we can use the values defined there
    from scrapy.conf import settings
    import pymongo
    
    class GetquotesPipeline(object):
    
        # Connect to the database when the pipeline is created
        def __init__(self):
            
            # Read the database connection settings
            host = settings['MONGODB_HOST']
            port = settings['MONGODB_PORT']
            dbname = settings['MONGODB_DBNAME']
            client = pymongo.MongoClient(host=host, port=port)
            
            # Select the database and collection
            db = client[dbname]
            self.table = db[settings['MONGODB_TABLE']]
        
        # Process each item
        def process_item(self, item, spider):
            # convert the item to a dict and insert it into MongoDB
            quote_info = dict(item)
            self.table.insert(quote_info)
            return item
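
    Note that from scrapy.conf import settings and pymongo's insert() both worked at the time but are deprecated (scrapy.conf was removed in later Scrapy releases, and pymongo 4 removed insert()). A sketch of the same pipeline against current APIs, assuming the settings keys defined above, would use from_crawler and insert_one:

    import pymongo


    class GetquotesPipeline(object):

        def __init__(self, host, port, dbname, table):
            self.host = host
            self.port = port
            self.dbname = dbname
            self.table_name = table

        @classmethod
        def from_crawler(cls, crawler):
            # read the MongoDB settings defined in settings.py
            s = crawler.settings
            return cls(
                host=s.get('MONGODB_HOST', 'localhost'),
                port=s.getint('MONGODB_PORT', 27017),
                dbname=s.get('MONGODB_DBNAME'),
                table=s.get('MONGODB_TABLE'),
            )

        def open_spider(self, spider):
            # open the connection once when the spider starts
            self.client = pymongo.MongoClient(host=self.host, port=self.port)
            self.table = self.client[self.dbname][self.table_name]

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            # insert_one replaces the deprecated insert()
            self.table.insert_one(dict(item))
            return item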

    Correspondingly, the code in myspider.py changes as follows:

    import scrapy
    import pymongo
    
    # Don't forget to import the item we defined
    from getquotes.items import GetquotesItem
    
    class myspider(scrapy.Spider):
    
        # Spider name
        name = "get_quotes"
    
        # Start URL
        start_urls = ['http://quotes.toscrape.com']
    
        '''
            # configure the client: default host localhost, port 27017
            client = pymongo.MongoClient('localhost', 27017)
            # create a database named store_quotes
            db_name = client['store_quotes']
            # create a collection
            quotes_list = db_name['quotes']
        '''
        def parse(self, response):
    
            # use CSS selectors for extraction; BeautifulSoup or similar would also work
            # first locate each whole quote block, then extract the author, text and tags inside it
            for quote in response.css('.quote'):
                '''
                # old approach: write the scraped data straight into MongoDB with insert
                yield self.quotes_list.insert({
                    'author' : quote.css('small.author::text').extract_first(),
                    'tags' : quote.css('div.tags a.tag::text').extract(),
                    'content' : quote.css('span.text::text').extract_first()
                })
                '''
                item = GetquotesItem()
                item['author'] = quote.css('small.author::text').extract_first()
                item['content'] = quote.css('span.text::text').extract_first()
                item['tags'] = quote.css('div.tags a.tag::text').extract()
                yield item
    
    
            # use XPath to get the href attribute of the "next" button
            next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
            # check whether a next page exists
            if next_href is not None:
    
                # if there is a next page, build its absolute URL with urljoin, e.g.
                # http://quotes.toscrape.com/page/2/
                next_page = response.urljoin(next_href)
    
                # call back into parse to handle the next page
                yield scrapy.Request(next_page, callback=self.parse)
    
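    With the pipeline enabled, the crawl is started as usual with "scrapy crawl get_quotes", and every yielded item flows into MongoDB. A quick way to check the stored data afterwards (a standalone sketch, assuming MongoDB runs locally and the database/collection names from settings.py above):

    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    collection = client['store_quotes2']['quotes2']

    # count and peek at a few stored quotes
    print('stored quotes:', collection.count_documents({}))
    for doc in collection.find().limit(3):
        print(doc['author'], '-', doc['content'])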
  • Original article: https://www.cnblogs.com/brady-wang/p/9699421.html