scrapy pipeline

    The four pipeline methods

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called once at startup to create the pipeline instance
        :param crawler:
        :return:
        """
        pass


    def open_spider(self, spider):
        """
        Called when the spider starts
        :param spider:
        :return:
        """
        pass


    def process_item(self, item, spider):
        """
        Called for every item that needs to be persisted
        :param item:
        :param spider:
        :return:
        """

        return item


    def close_spider(self, spider):
        """
        Called when the spider finishes
        :param spider:
        :return:
        """
        pass
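
    As a quick illustration of the lifecycle, here is a minimal sketch of the same four hooks writing
    each item to a JSON Lines file (the pipeline name, the settings key JSONLINES_PATH and the default
    file name are assumptions, not part of the original post):

    import json


    class JsonLinesPipeline(object):
        def __init__(self, path):
            self.path = path
            self.file = None

        @classmethod
        def from_crawler(cls, crawler):
            # read the output path from settings, with a fallback default (assumed key)
            return cls(crawler.settings.get('JSONLINES_PATH', 'items.jl'))

        def open_spider(self, spider):
            # open the output file once, when the spider starts
            self.file = open(self.path, 'w', encoding='utf-8')

        def process_item(self, item, spider):
            # write each item as one JSON line and pass it on to later pipelines
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
            return item

        def close_spider(self, spider):
            # close the file when the spider finishes
            self.file.close()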

    Example

    import pymysql
    from scrapy.exceptions import DropItem
    
    
    class ChoutiPipeline(object):
        def __init__(self, db_conf):
            self.db_conf = db_conf
            self.conn = None
            self.cursor = None
    
        @classmethod
        def from_crawler(cls, crawler):
            """
            初始化的时候,用以创建pipeline对象
            :param crawler:
            :return:
            """
            db_conf = crawler.settings.get('DATABASE')
            return cls(db_conf)
    
        def open_spider(self, spider):
            """
            爬虫开始执行时,调用
            :param spider:
            :return:
            """
            print('爬虫开始 ...')
            self.conn = pymysql.connect(
                host=self.db_conf['host'],
                port=self.db_conf['port'],
                user=self.db_conf['user'],
                passwd=self.db_conf['password'],
                db=self.db_conf['db'],
                charset=self.db_conf['charset']
            )
            self.cursor = self.conn.cursor()
    
        def process_item(self, item, spider):
            """
                  每当数据需要持久化时,就会被调用
                  :param item:
                  :param spider:
                  :return:
            """
    
            sql = 'INSERT INTO articles(title, title_url, summary, create_time, url_md5)' 
                  ' VALUES ("%s", "%s" ,"%s", "%s", "%s")'
    
            a = sql % (item['title'], item['title_url'], item['summary'], item['create_time'], item['url_md5'])
    
            try:
                self.cursor.execute(a)
                self.conn.commit()
            except Exception as e:
                print(e)
            return DropItem()
    
        def close_spider(self, spider):
            """
                爬虫结束执行时,调用
                :param spider:
                :return:
            """
            self.cursor.close()
            self.conn.close()
            print('爬虫结束 ...')
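
    The pipeline above assumes an item with the fields title, title_url, summary, create_time and
    url_md5; a matching items.py might look like this (the item class name is an assumption):

    import scrapy


    class ChoutiItem(scrapy.Item):
        # fields read by ChoutiPipeline.process_item
        title = scrapy.Field()
        title_url = scrapy.Field()
        summary = scrapy.Field()
        create_time = scrapy.Field()
        url_md5 = scrapy.Field()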

    Register the pipeline in the settings file

    Global configuration:

    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'day1.pipelines.ChoutiPipeline': 300,
    }
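
    from_crawler reads a DATABASE entry from the settings, so settings.py also needs something like
    the following (the concrete values are placeholders; only the keys are taken from the code above):

    # consumed by ChoutiPipeline.from_crawler via crawler.settings.get('DATABASE')
    DATABASE = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'root',
        'password': '******',
        'db': 'chouti',
        'charset': 'utf8mb4',
    }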

    You can also control which pipelines a particular spider runs via custom_settings:

    class ChoutiSpider(scrapy.Spider):
        name = 'Chouti'
        allowed_domains = ['dig.chouti.com']
        start_urls = ['https://dig.chouti.com/']
    
        custom_settings = {
            'ITEM_PIPELINES': {'day1.pipelines.ChoutiPipeline': 1}
        }
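
    The integer is the pipeline's order: items pass through pipelines with smaller numbers first, so a
    spider could chain a (hypothetical) validation pipeline in front of the MySQL one:

    custom_settings = {
        'ITEM_PIPELINES': {
            'day1.pipelines.ValidatePipeline': 100,  # hypothetical pipeline, runs first
            'day1.pipelines.ChoutiPipeline': 300,    # runs after validation
        }
    }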