  • Scrapy extensions

    The extensions.py file


    # -*- coding: utf-8 -*-
    # This extension logs a message on the following events:
    #   - a spider is opened
    #   - a spider is closed
    #   - a specific number of items has been scraped
    import logging
    from collections import defaultdict
    from datetime import datetime

    from scrapy import signals
    from scrapy.exceptions import NotConfigured

    logger = logging.getLogger(__name__)


    class SpiderOpenCloseLogging(object):

        def __init__(self, item_count):
            self.item_count = item_count
            self.items_scraped = 0
            self.items_dropped = 0
            self.stats = defaultdict(int)      # per-minute count of all responses (defaults to 0)
            self.err_stats = defaultdict(int)  # per-minute count of error responses (defaults to 0)
            print("==" * 20, 'Extension object created')

        @classmethod
        def from_crawler(cls, crawler):
            # First check whether the extension should be enabled and raise
            # NotConfigured otherwise. Key point: if MYEXT_ENABLED is not set to
            # True in settings, the extension object is never created.
            if not crawler.settings.getbool('MYEXT_ENABLED'):
                raise NotConfigured

            # Get the item count from settings: by default a message is written
            # only every 1000 items; override it with MYEXT_ITEMCOUNT in settings.
            item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

            # Instantiate the extension object.
            ext = cls(item_count)

            # Connect the extension object to signals:
            # ext.spider_opened is bound to signals.spider_opened, and so on.
            crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
            crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
            # signals.item_scraped is the main one: it fires after an item has been
            # scraped and has passed through all pipelines without being dropped.
            crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
            # signals.item_dropped fires when an item is dropped by a pipeline.
            crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
            # signals.response_received fires for every response the engine receives.
            crawler.signals.connect(ext.response_received, signal=signals.response_received)

            # Return the extension object.
            return ext

        def spider_opened(self, spider):
            # spider.log("opened spider %s" % spider.name) would also work;
            # print is used here for simplicity.
            print("opened spider %s" % spider.name)

        def spider_closed(self, spider):
            print("closed spider %s" % spider.name)

        def item_scraped(self, item, spider):
            self.items_scraped += 1
            if self.items_scraped % self.item_count == 0:
                print("scraped %d items" % self.items_scraped)

        def item_dropped(self, item, spider, response, exception):
            self.items_dropped += 1
            if self.items_dropped % self.item_count == 0:
                print("dropped %d items" % self.items_dropped)

        def response_received(self, response, request, spider):
            # Monitor the spider's health: count, for the current minute, how many
            # responses were received and how many of them had an error status.
            now = datetime.now().strftime('%Y%m%d%H%M')
            self.stats[now] += 1  # total responses this minute
            if response.status in [401, 403, 404, 500, 501, 502]:
                self.err_stats[now] += 1  # error responses this minute
            if self.err_stats[now] / float(self.stats[now]) > 0.2:  # error ratio
                # In production, warnings typically trigger an email and errors a
                # text message; WARNING sits between INFO and ERROR in severity.
                logger.warning(
                    f'received {self.stats[now]} responses in minute {now}, '
                    f'{self.err_stats[now]} of them were not 200'
                )
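    A note on the counters: instead of keeping local defaultdicts, the same per-minute health check could be routed through Scrapy's built-in stats collector (crawler.stats), so the numbers show up in the final stats dump. The sketch below assumes that approach; the health/... key names are made up for illustration and are not a Scrapy convention.

    # Sketch: the same per-minute health counters kept in Scrapy's stats collector.
    # The 'health/...' key names are illustrative placeholders.
    from datetime import datetime

    from scrapy import signals


    class StatsBackedHealthCheck:

        def __init__(self, stats):
            self.stats = stats  # crawler.stats, Scrapy's StatsCollector

        @classmethod
        def from_crawler(cls, crawler):
            ext = cls(crawler.stats)
            crawler.signals.connect(ext.response_received, signal=signals.response_received)
            return ext

        def response_received(self, response, request, spider):
            minute = datetime.now().strftime('%Y%m%d%H%M')
            self.stats.inc_value(f'health/total/{minute}')
            if response.status >= 400:
                self.stats.inc_value(f'health/error/{minute}')

    Counters recorded this way are printed in the "Dumping Scrapy stats" block at the end of the crawl, next to Scrapy's own downloader/response_status_count/* entries.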
    Configuration in settings.py
    
    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    MYEXT_ENABLED = True  # enable the custom extension
    MYEXT_ITEMCOUNT = 10  # print/log once every 10 items scraped
    EXTENSIONS = {
       # 'scrapy.extensions.telnet.TelnetConsole': None,
       'qianmu.extensions.SpiderOpenCloseLogging': 1,
    }
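
    To see the extension fire, run any spider in the project. A minimal sketch of such a spider follows; the spider name, start URL, and item field are hypothetical placeholders, not part of the original project.

    # Minimal spider sketch to exercise the extension above.
    # Name, start URL and item field are placeholders.
    import scrapy


    class DemoSpider(scrapy.Spider):
        name = 'demo'
        start_urls = ['https://example.com/']

        def parse(self, response):
            # Every item yielded here fires signals.item_scraped once it has
            # passed through all pipelines without being dropped.
            for href in response.css('a::attr(href)').getall():
                yield {'link': response.urljoin(href)}

    With MYEXT_ITEMCOUNT = 10, running "scrapy crawl demo" prints "opened spider demo" at startup, then "scraped 10 items", "scraped 20 items", and so on, and finally "closed spider demo" when the crawl ends.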



  • Original post: https://www.cnblogs.com/kenD/p/12248037.html