源码
class DepthMiddleware(object):
def __init__(self, maxdepth, stats, verbose_stats=False, prio=1):
self.maxdepth = maxdepth
self.stats = stats
self.verbose_stats = verbose_stats
self.prio = prio
@classmethod
def from_crawler(cls, crawler):
settings = crawler.settings
maxdepth = settings.getint('DEPTH_LIMIT')
verbose = settings.getbool('DEPTH_STATS_VERBOSE')
prio = settings.getint('DEPTH_PRIORITY')
return cls(maxdepth, crawler.stats, verbose, prio)
def process_spider_output(self, response, result, spider):
def _filter(request):
if isinstance(request, Request):
depth = response.meta['depth'] + 1
request.meta['depth'] = depth
if self.prio:
request.priority -= depth * self.prio
if self.maxdepth and depth > self.maxdepth:
logger.debug(
"Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
{'maxdepth': self.maxdepth, 'requrl': request.url},
extra={'spider': spider}
)
return False
else:
if self.verbose_stats:
self.stats.inc_value('request_depth_count/%s' % depth,
spider=spider)
self.stats.max_value('request_depth_max', depth,
spider=spider)
return True
# base case (depth=0)
if 'depth' not in response.meta:
response.meta['depth'] = 0
if self.verbose_stats:
self.stats.inc_value('request_depth_count/0', spider=spider)
return (r for r in result or () if _filter(r))
配置
DEPTH_LIMIT = 2 深度限制
开启后 有输出 request_depth_0 1 2 3 4 分别收集了多少个
DEPTH_STATS_VERBOSE = True 深度状态收集
DEPTH_PRIORITY = 5 int 涉及到广度优先还是深度优先
深度优先会先爬取2 3 4 深度的
广度优先会先爬取完1的 再爬取2
正数广度优先 优先级越高 越先请求,因为
request.priority -= depth * self.prio 所以 设置为正数的时候,每次优先级减少,越往后面再请求 就变成先请求前面所有的再请求后面的,广度了
负数 深度优先