The extensions.py file
# -*- coding: utf-8 -*-
# This extension logs a message on the following events:
#   - a spider is opened
#   - a spider is closed
#   - a specific number of items has been scraped
import logging
from collections import defaultdict
from scrapy import signals
from scrapy.exceptions import NotConfigured
from datetime import datetime
logger = logging.getLogger(__name__)
class SpiderOpenCloseLogging(object):

    def __init__(self, item_count):
        self.item_count = item_count
        self.items_scraped = 0
        self.items_dropped = 0
        self.stats = defaultdict(int)      # responses received per minute, defaults to 0
        self.err_stats = defaultdict(int)  # error responses per minute, defaults to 0
        print("==" * 20, 'Extension object created')
    @classmethod
    def from_crawler(cls, crawler):
        # first check if the extension should be enabled and raise
        # NotConfigured otherwise
        # key point: if MYEXT_ENABLED is False, object creation is abandoned right here;
        # add MYEXT_ENABLED to settings and set it to True
        if not crawler.settings.getbool('MYEXT_ENABLED'):
            raise NotConfigured

        # get the number of items from settings
        # by default a message is logged only every 1000 items;
        # the interval can be changed via MYEXT_ITEMCOUNT in settings
        item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)

        # instantiate the extension object
        ext = cls(item_count)

        # connect the extension object to signals
        # bind ext.spider_opened to the signals.spider_opened signal,
        # which fires whenever a spider is opened
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        # item_scraped is the main signal: it fires after an item has been scraped
        # and has passed through all pipelines without being dropped
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        # register item_dropped, fired whenever an item is dropped by a pipeline
        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
        # register ext.response_received
        crawler.signals.connect(ext.response_received, signal=signals.response_received)

        # return the extension object
        return ext
    def spider_opened(self, spider):
        # spider.log("opened spider %s" % spider.name)
        # spider.log can be replaced with print
        print("opened spider %s" % spider.name)

    def spider_closed(self, spider):
        # spider.log("closed spider %s" % spider.name)
        # spider.log can be replaced with print
        print("closed spider %s" % spider.name)
    def item_scraped(self, item, spider):
        self.items_scraped += 1
        if self.items_scraped % self.item_count == 0:
            # spider.log("scraped %d items" % self.items_scraped)
            # spider.log can be replaced with print
            print("scraped %d items" % self.items_scraped)

    def item_dropped(self, item, spider, response, exception):
        self.items_dropped += 1
        if self.items_dropped % self.item_count == 0:
            # spider.log("dropped %d items" % self.items_dropped)
            print("dropped %d items" % self.items_dropped)
    def response_received(self, response, request, spider):  # monitor the health of the crawl
        # count normal and error responses for the current minute
        now = datetime.now().strftime('%Y%m%d%H%M')
        self.stats[now] += 1  # total responses for this minute +1
        if response.status in [401, 403, 404, 500, 501, 502]:
            self.err_stats[now] += 1  # error responses for this minute +1
        if self.err_stats[now] / float(self.stats[now]) > 0.2:  # error ratio too high
            # in a typical production setup, WARNING-level messages trigger an email and ERROR-level ones an SMS;
            # WARNING is lower than ERROR but higher than INFO
            logger.warning(f'received {self.stats[now]} responses and {self.err_stats[now]} of them are not 200, {now}')
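
To make the 20% threshold concrete: if the spider has received 40 successful responses in the current minute and errors then start arriving, the condition first becomes true at the 11th error, because 11 / 51 ≈ 0.216 > 0.2. A minimal sketch of how the threshold logic could be exercised outside of Scrapy (the fake response built with SimpleNamespace is purely illustrative, not part of the project):

from types import SimpleNamespace

ext = SpiderOpenCloseLogging(item_count=10)
# 40 successful responses followed by 11 error responses:
# the warning first fires at the 11th error (11 / 51 ≈ 0.216 > 0.2),
# emitted through the module-level logger defined above
for status in [200] * 40 + [500] * 11:
    ext.response_received(SimpleNamespace(status=status), request=None, spider=None)
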
Configuration in settings.py
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
MYEXT_ENABLED = True   # enable the custom extension
MYEXT_ITEMCOUNT = 10   # print/log a message every 10 items
EXTENSIONS = {
    # 'scrapy.extensions.telnet.TelnetConsole': None,
    'qianmu.extensions.SpiderOpenCloseLogging': 1,
}
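
The integer assigned in EXTENSIONS is the component's load order (for extensions, which rarely depend on each other, the exact value seldom matters), while assigning None instead, as in the commented-out TelnetConsole entry, disables a component. With the settings above in place the extension can be tried from the command line; the spider name below is only a placeholder, and -s overrides a setting for a single run:

scrapy crawl some_spider -s MYEXT_ITEMCOUNT=100
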