zoukankan      html  css  js  c++  java
  • scrapy pipeline

    pipeline的四个方法

    @classmethod
    def from_crawler(cls, crawler):
        """
        Factory hook: Scrapy calls this once to build the pipeline object.

        :param crawler: the running crawler (exposes project settings/signals)
        :return: a new pipeline instance
        """
        pass
    
    
    def open_spider(self, spider):
        """
        Hook invoked once when the spider starts executing.

        :param spider: the spider being opened
        :return: None
        """
        pass
    
    
    def process_item(self, item, spider):
        """
        Hook invoked for every item that needs to be persisted.

        :param item: the scraped item
        :param spider: the spider that produced it
        :return: the item, handed on to the next pipeline
        """
        return item
    
    
    def close_spider(self, spider):
        """
        Hook invoked once when the spider finishes executing.

        :param spider: the spider being closed
        :return: None
        """
        pass

    实例

    import pymysql
    from scrapy.exceptions import DropItem
    
    
    class ChoutiPipeline(object):
        """Persist scraped articles to MySQL; one connection per spider run."""

        def __init__(self, db_conf):
            # db_conf: dict of pymysql connection settings (DATABASE in settings.py)
            self.db_conf = db_conf
            self.conn = None
            self.cursor = None

        @classmethod
        def from_crawler(cls, crawler):
            """
            Called once by Scrapy to build the pipeline instance.

            :param crawler: crawler whose settings provide the DATABASE dict
            :return: a ChoutiPipeline configured from the project settings
            """
            db_conf = crawler.settings.get('DATABASE')
            return cls(db_conf)

        def open_spider(self, spider):
            """
            Called when the spider starts: open the DB connection and cursor.

            :param spider: the spider being opened
            """
            print('爬虫开始 ...')
            self.conn = pymysql.connect(
                host=self.db_conf['host'],
                port=self.db_conf['port'],
                user=self.db_conf['user'],
                passwd=self.db_conf['password'],
                db=self.db_conf['db'],
                charset=self.db_conf['charset']
            )
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            """
            Persist one item, then drop it so later pipelines skip it.

            :param item: scraped item with title/title_url/summary/create_time/url_md5
            :param spider: the spider that produced the item
            :raises DropItem: always, after the insert attempt
            """
            # Parameterized query: the driver escapes the values itself. The
            # original built the statement with %-interpolation into quoted
            # "%s" placeholders, which broke on quotes in the data and was
            # open to SQL injection (it was also a SyntaxError: two string
            # literals on separate lines without parentheses).
            sql = ('INSERT INTO articles(title, title_url, summary, create_time, url_md5)'
                   ' VALUES (%s, %s, %s, %s, %s)')
            params = (item['title'], item['title_url'], item['summary'],
                      item['create_time'], item['url_md5'])

            try:
                self.cursor.execute(sql, params)
                self.conn.commit()
            except Exception as e:
                self.conn.rollback()  # keep the connection usable after a failed insert
                print(e)
            # DropItem must be *raised*, not returned: returning the exception
            # instance would hand it to the next pipeline as if it were an item.
            raise DropItem()

        def close_spider(self, spider):
            """
            Called when the spider finishes: release the cursor and connection.

            :param spider: the spider being closed
            """
            self.cursor.close()
            self.conn.close()
            print('爬虫结束 ...')

    注册配置文件

    全局配置:

    # Configure item pipelines
    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    ITEM_PIPELINES = {
       # The value is the pipeline's order: lower runs earlier (0-1000 by convention).
       'day1.pipelines.ChoutiPipeline': 300,
    }

    也可以控制某个爬虫执行哪一个pipeline

    class ChoutiSpider(scrapy.Spider):
        # Crawls dig.chouti.com starting from the front page.
        name = 'Chouti'
        allowed_domains = ['dig.chouti.com']
        start_urls = ['https://dig.chouti.com/']
    
        # Spider-local settings override the project-wide ITEM_PIPELINES,
        # so only this spider runs ChoutiPipeline (priority 1).
        custom_settings = {
            'ITEM_PIPELINES': {'day1.pipelines.ChoutiPipeline': 1}
        }
  • 相关阅读:
    leetcode------Remove Element
    leetcode------Merge Two Sorted Lists
    [转载]Unity3D 游戏引擎之使用C#语言建立本地数据库(SQLITE)
    [转载]VS2012创建MVC3项目提示错误: 此模板尝试加载组件程序集 “NuGet.VisualStudio.Interop, Version=1.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a”。
    [转载]Unity3D 访问Access数据库
    [转载]C#导入XLS数据到数据库
    [转载]Unity3D的断点调试功能
    [Unity3D]支持的视频格式
    unity 3d 获取鼠标当前坐标
    [转载]VS2012程序打包部署详解
  • 原文地址:https://www.cnblogs.com/yuqiangli0616/p/9308734.html
Copyright © 2011-2022 走看看