zoukankan      html  css  js  c++  java
  • 各种爬虫管道

    import json
    from datetime import datetime

    import pymongo
    import redis
    from scrapy.exporters import JsonItemExporter, CsvItemExporter

    from .settings import REDIS_HOST, REDIS_PORT, MONGO_HOST, MONGO_PORT


    # 数据源的管道
    class AqiDataPipeline(object):
        """Stamp every item with provenance metadata.

        Two fields are written onto the item: ``crawl_time`` (the moment the
        item passed through this pipeline) and ``spider`` (the name of the
        spider that produced it).
        """

        def process_item(self, item, spider):
            """Annotate ``item`` in place and return it for the next pipeline."""
            # Naive datetime expressed in UTC (no tzinfo attached).
            stamp = datetime.utcnow()
            item['crawl_time'] = stamp
            # Remember which spider this item came from.
            origin = spider.name
            item['spider'] = origin
            return item


    # Json的管道
    class AqiJsonPipeline(object):
        """Write every item to ``aqi.json`` through a ``JsonItemExporter``.

        The output file is created (or truncated) in the current working
        directory when the spider opens and is closed when the spider closes.
        """

        def open_spider(self, spider):
            """Open the output file and start the export."""
            # The file is opened in binary mode and handed to the exporter.
            self.file = open("aqi.json", 'wb')
            self.write = JsonItemExporter(self.file)
            self.write.start_exporting()

        def process_item(self, item, spider):
            """Export ``item`` and return it unchanged for the next pipeline."""
            self.write.export_item(item)
            return item

        def close_spider(self, spider):
            """Finish the export and close the output file.

            The file is closed even when ``finish_exporting()`` raises, so a
            failing exporter cannot leak the open handle; the exception still
            propagates to the caller.
            """
            try:
                self.write.finish_exporting()
            finally:
                self.file.close()


    # Csv的管道
    class AqiVscPipeline(object):
        """Write every item to ``aqi.csv`` through a ``CsvItemExporter``.

        The output file is created (or truncated) in the current working
        directory when the spider opens and is closed when the spider closes.
        """

        def open_spider(self, spider):
            """Open the output file and start the export."""
            # The file is opened in binary mode and handed to the exporter.
            self.file = open("aqi.csv", 'wb')
            self.write = CsvItemExporter(self.file)
            self.write.start_exporting()

        def process_item(self, item, spider):
            """Export ``item`` and return it unchanged for the next pipeline."""
            self.write.export_item(item)
            return item

        def close_spider(self, spider):
            """Finish the export and close the output file.

            The file is closed even when ``finish_exporting()`` raises, so a
            failing exporter cannot leak the open handle; the exception still
            propagates to the caller.
            """
            try:
                self.write.finish_exporting()
            finally:
                self.file.close()


    # mongodb数据库管道
    class AqiMongoPipeline(object):
        """Store every item as a document in MongoDB.

        Documents go to the ``aqi`` collection of the ``Aqi`` database on the
        server given by the ``MONGO_HOST`` / ``MONGO_PORT`` settings.
        """

        def open_spider(self, spider):
            """Connect to MongoDB and resolve the target collection."""
            self.client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
            self.db = self.client['Aqi']
            self.collection = self.db['aqi']

        def process_item(self, item, spider):
            """Insert a copy of ``item`` and return the item unchanged."""
            # Collection.insert() was deprecated in PyMongo 3.0 and removed in
            # 4.0; insert_one() is its single-document replacement.
            # insert_one() adds an ``_id`` key to the mapping it is given, so
            # a dict copy is inserted to keep the item itself untouched.
            self.collection.insert_one(dict(item))
            return item

        def close_spider(self, spider):
            """Close the MongoDB client."""
            self.client.close()


    # redis数据库管道
    class AqiRedisPipeline(object):
        """Push every item, serialized as JSON, onto the Redis list ``aqi``.

        The server is the one given by the ``REDIS_HOST`` / ``REDIS_PORT``
        settings.
        """

        def open_spider(self, spider):
            """Create the Redis client."""
            self.client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

        @staticmethod
        def _json_default(value):
            """Convert values that ``json`` cannot encode natively.

            Datetimes become ISO-8601 strings; anything else falls back to
            ``str()`` so that an unusual field type never aborts the crawl.
            """
            if isinstance(value, datetime):
                return value.isoformat()
            return str(value)

        def process_item(self, item, spider):
            """Push a JSON copy of ``item`` to the list and return the item."""
            # redis-py 3.0+ only accepts bytes, str, int or float as values
            # and raises DataError for a dict, so the item is serialized
            # first. JSON also lets consumers parse the value back.
            # json.dumps keeps its default ASCII-only output here.
            payload = json.dumps(dict(item), default=self._json_default)
            self.client.lpush('aqi', payload)
            return item

        def close_spider(self, spider):
            """Release the client's pooled connections when the spider closes."""
            self.client.connection_pool.disconnect()

  • 相关阅读:
    使用OpenSSL自建一个HTTPS服务
    工程实践项目中的需求分析建模—问答系统后端
    代码中的软件工程—分析一个命令行菜单小程序
    Git多人项目开发流程演练
    Docker笔记
    Nginx+Gunicorn+Supervisor部署Flask应用
    Python协程之asyncio
    Python类元编程
    搬家到博客园啦
    Spring boot Security 登陆安全配置
  • 原文地址:https://www.cnblogs.com/hanjian200ok/p/9526028.html
Copyright © 2011-2022 走看看