  • scrapy pipeline

    The four pipeline methods

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called once when Scrapy creates the pipeline object.
        :param crawler:
        :return:
        """
        pass


    def open_spider(self, spider):
        """
        Called when the spider starts running.
        :param spider:
        :return:
        """
        pass


    def process_item(self, item, spider):
        """
        Called for every item that needs to be persisted.
        :param item:
        :param spider:
        :return:
        """

        return item


    def close_spider(self, spider):
        """
        Called when the spider finishes running.
        :param spider:
        :return:
        """
        pass
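
    Note that process_item must either return the item, which Scrapy then hands to the next pipeline, or raise DropItem to discard it. A minimal hypothetical sketch of that contract (the FilterPipeline class and the missing-title check are illustrative, not from the original post):

    from scrapy.exceptions import DropItem


    class FilterPipeline(object):
        def process_item(self, item, spider):
            # Discard items without a title; later pipelines never see them.
            if not item.get('title'):
                raise DropItem('missing title')
            # Returning the item passes it on to the next pipeline.
            return item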

    Example

    import pymysql
    from scrapy.exceptions import DropItem
    
    
    class ChoutiPipeline(object):
        def __init__(self, db_conf):
            self.db_conf = db_conf
            self.conn = None
            self.cursor = None
    
        @classmethod
        def from_crawler(cls, crawler):
            """
            Called once when Scrapy creates the pipeline object.
            :param crawler:
            :return:
            """
            db_conf = crawler.settings.get('DATABASE')
            return cls(db_conf)
    
        def open_spider(self, spider):
            """
            Called when the spider starts running.
            :param spider:
            :return:
            """
            print('Spider started ...')
            self.conn = pymysql.connect(
                host=self.db_conf['host'],
                port=self.db_conf['port'],
                user=self.db_conf['user'],
                passwd=self.db_conf['password'],
                db=self.db_conf['db'],
                charset=self.db_conf['charset']
            )
            self.cursor = self.conn.cursor()
    
        def process_item(self, item, spider):
            """
                  每当数据需要持久化时,就会被调用
                  :param item:
                  :param spider:
                  :return:
            """
    
            sql = 'INSERT INTO articles(title, title_url, summary, create_time, url_md5)' 
                  ' VALUES ("%s", "%s" ,"%s", "%s", "%s")'
    
            a = sql % (item['title'], item['title_url'], item['summary'], item['create_time'], item['url_md5'])
    
            try:
                self.cursor.execute(a)
                self.conn.commit()
            except Exception as e:
                print(e)
            return DropItem()
    
        def close_spider(self, spider):
            """
                爬虫结束执行时,调用
                :param spider:
                :return:
            """
            self.cursor.close()
            self.conn.close()
            print('Spider finished ...')

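    The example's process_item reads five fields off each item. A sketch of a matching items.py follows, with the field names taken from the pipeline code above; the ChoutiItem class name is an assumption:

    import scrapy


    class ChoutiItem(scrapy.Item):
        # Field names mirror the keys the pipeline inserts into MySQL;
        # the class name is assumed, not from the original post.
        title = scrapy.Field()
        title_url = scrapy.Field()
        summary = scrapy.Field()
        create_time = scrapy.Field()
        url_md5 = scrapy.Field()
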
    Registering the pipeline in the settings file

    Global configuration (the integer is the pipeline's order; enabled pipelines run in ascending order of these values, which range from 0 to 1000):

    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       'day1.pipelines.ChoutiPipeline': 300,
    }
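
    The pipeline's from_crawler reads a DATABASE entry from these same settings. A minimal sketch of that entry, with the key names taken from how open_spider uses them; the values here are placeholders, not from the original post:

    # settings.py -- illustrative values only; the key names must match
    # what ChoutiPipeline.open_spider reads.
    DATABASE = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'root',
        'password': 'secret',
        'db': 'chouti',
        'charset': 'utf8mb4',
    }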

    A spider can also control which pipelines run for it via custom_settings, which overrides the project-wide ITEM_PIPELINES for that spider only:

    class ChoutiSpider(scrapy.Spider):
        name = 'Chouti'
        allowed_domains = ['dig.chouti.com']
        start_urls = ['https://dig.chouti.com/']
    
        custom_settings = {
            'ITEM_PIPELINES': {'day1.pipelines.ChoutiPipeline': 1}
        }
  • Original post: https://www.cnblogs.com/yuqiangli0616/p/9308734.html