The four methods of a pipeline
@classmethod
def from_crawler(cls, crawler):
    """
    Called at initialization time to create the pipeline object
    :param crawler:
    :return:
    """
    pass

def open_spider(self, spider):
    """
    Called when the spider starts running
    :param spider:
    :return:
    """
    pass

def process_item(self, item, spider):
    """
    Called for every item that needs to be persisted
    :param item:
    :param spider:
    :return:
    """
    return item

def close_spider(self, spider):
    """
    Called when the spider finishes running
    :param spider:
    :return:
    """
    pass
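To make the lifecycle concrete, here is a minimal sketch of a pipeline that writes every item to a JSON Lines file. It is not part of the original project; the EXPORT_FILE setting name and the items.jl default are assumptions made for illustration.

import json


class JsonLinesPipeline(object):
    """Minimal sketch: writes each item as one JSON line (names are hypothetical)."""

    def __init__(self, file_path):
        self.file_path = file_path
        self.file = None

    @classmethod
    def from_crawler(cls, crawler):
        # Read a setting when the pipeline is constructed; 'EXPORT_FILE' is an assumed name
        return cls(crawler.settings.get('EXPORT_FILE', 'items.jl'))

    def open_spider(self, spider):
        # Open resources once, when the spider starts
        self.file = open(self.file_path, 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Called for every item; return it so later pipelines still receive it
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        # Release resources when the spider finishes
        self.file.close()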
Example
import pymysql
from scrapy.exceptions import DropItem


class ChoutiPipeline(object):
    def __init__(self, db_conf):
        self.db_conf = db_conf
        self.conn = None
        self.cursor = None

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called at initialization time to create the pipeline object
        :param crawler:
        :return:
        """
        db_conf = crawler.settings.get('DATABASE')
        return cls(db_conf)

    def open_spider(self, spider):
        """
        Called when the spider starts running
        :param spider:
        :return:
        """
        print('Spider started ...')
        self.conn = pymysql.connect(
            host=self.db_conf['host'],
            port=self.db_conf['port'],
            user=self.db_conf['user'],
            passwd=self.db_conf['password'],
            db=self.db_conf['db'],
            charset=self.db_conf['charset']
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """
        Called for every item that needs to be persisted
        :param item:
        :param spider:
        :return:
        """
        # Use parameterized placeholders so pymysql handles quoting and escaping
        sql = ('INSERT INTO articles(title, title_url, summary, create_time, url_md5)'
               ' VALUES (%s, %s, %s, %s, %s)')
        try:
            self.cursor.execute(sql, (item['title'], item['title_url'], item['summary'],
                                      item['create_time'], item['url_md5']))
            self.conn.commit()
        except Exception as e:
            print(e)
            # DropItem must be raised, not returned, for Scrapy to discard the item
            raise DropItem('Failed to insert item: %s' % e)
        # Return the item so any later pipelines can keep processing it
        return item

    def close_spider(self, spider):
        """
        Called when the spider finishes running
        :param spider:
        :return:
        """
        self.cursor.close()
        self.conn.close()
        print('Spider finished ...')
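The from_crawler call above reads a DATABASE dict from the project settings. A sketch of what that entry in settings.py could look like; only the keys are taken from the pipeline code, the values are placeholders:

# settings.py -- keys match what ChoutiPipeline reads; values are placeholders
DATABASE = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root',
    'password': 'your_password',
    'db': 'chouti',
    'charset': 'utf8mb4',
}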
Registering the pipeline in the settings file
Global configuration:
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'day1.pipelines.ChoutiPipeline': 300,
}
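The number (0-1000) controls the order in which pipelines run: lower values run first. As a sketch, if the project also had a hypothetical DuplicatesPipeline, it could be placed ahead of the MySQL pipeline like this:

ITEM_PIPELINES = {
    'day1.pipelines.DuplicatesPipeline': 100,  # hypothetical pipeline, runs first
    'day1.pipelines.ChoutiPipeline': 300,      # runs afterwards, receives the surviving items
}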
You can also control which pipelines an individual spider runs:
import scrapy


class ChoutiSpider(scrapy.Spider):
    name = 'Chouti'
    allowed_domains = ['dig.chouti.com']
    start_urls = ['https://dig.chouti.com/']

    custom_settings = {
        'ITEM_PIPELINES': {'day1.pipelines.ChoutiPipeline': 1}
    }
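Conversely, a spider can opt out of a globally registered pipeline by setting its value to None in custom_settings. A sketch, assuming that behavior; OtherSpider and its URL are made up for illustration:

class OtherSpider(scrapy.Spider):
    name = 'Other'
    start_urls = ['https://example.com/']

    custom_settings = {
        # None disables the globally registered pipeline for this spider only
        'ITEM_PIPELINES': {'day1.pipelines.ChoutiPipeline': None}
    }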