429 Too Many Requests (太多请求)
当你需要限制客户端请求某个服务的数量,也就是限制请求速度时,该状态码就会非常有用。在此之前,也有一些类似的状态码,例如“509 Bandwidth Limit Exceeded”。
如果你希望限制客户端对服务的请求数,可使用 429 状态码,同时包含一个 Retry-After 响应头用于告诉客户端多长时间后可以再次请求服务。
middlewares.py # 当状态码是 429 时,爬虫暂停 60 秒后再继续爬取
import logging
import time
from email.utils import mktime_tz, parsedate_tz

from scrapy import signals
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message

logger = logging.getLogger(__name__)


class TooManyRequestsRetryMiddleware(RetryMiddleware):
    """Retry middleware that pauses the whole crawl on HTTP 429.

    When a 429 (Too Many Requests) response arrives, the engine is paused,
    the process sleeps for the period the server asked for in its
    ``Retry-After`` header (falling back to ``DEFAULT_PAUSE_SECONDS``), the
    engine is resumed and the request is rescheduled through the inherited
    retry logic. Other status codes listed in ``retry_http_codes`` are
    retried immediately, as the parent class does.
    """

    # Fallback pause, in seconds, when the response carries no usable
    # Retry-After header. Set this to the rate-limit window of the target.
    DEFAULT_PAUSE_SECONDS = 60

    def __init__(self, crawler):
        """Keep a handle on *crawler* so the engine can be paused later."""
        super(TooManyRequestsRetryMiddleware, self).__init__(crawler.settings)
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the running crawler."""
        return cls(crawler)

    def _pause_seconds(self, response):
        """Return how long to wait before retrying *response*'s request.

        Understands both forms of ``Retry-After``: a non-negative integer
        number of seconds, or an HTTP-date. Anything missing or unparsable
        yields ``DEFAULT_PAUSE_SECONDS``. The result is never negative.
        """
        raw = response.headers.get('Retry-After')
        if not raw:
            return self.DEFAULT_PAUSE_SECONDS
        if isinstance(raw, bytes):
            raw = raw.decode('latin-1')
        raw = raw.strip()

        # delay-seconds form, e.g. "120"
        if raw.isdigit():
            return int(raw)

        # HTTP-date form, e.g. "Wed, 21 Oct 2015 07:28:00 GMT"
        parsed = parsedate_tz(raw)
        if parsed is None:
            return self.DEFAULT_PAUSE_SECONDS
        try:
            remaining = mktime_tz(parsed) - time.time()
        except (ValueError, OverflowError):
            return self.DEFAULT_PAUSE_SECONDS
        # A date in the past means we may retry right away.
        return max(0.0, remaining)

    def process_response(self, request, response, spider):
        """Pause on 429, retry on configured codes, otherwise pass through.

        Returns the original response when retrying is disabled for the
        request or the retry budget is exhausted; otherwise returns the
        rescheduled request produced by ``_retry``.
        """
        if request.meta.get('dont_retry', False):
            return response

        if response.status == 429:
            delay = self._pause_seconds(response)
            self.crawler.engine.pause()
            try:
                logger.warning("速度太快 暂停%s秒", delay)
                # NOTE: this blocks the whole process on purpose so that no
                # further requests are sent while the rate limit is active.
                time.sleep(delay)
            finally:
                # Always resume, even if the sleep is interrupted, so the
                # engine is never left paused.
                self.crawler.engine.unpause()
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response

        return response
settings.py
import random

DOWNLOADER_MIDDLEWARES = {
    # Disable Scrapy's built-in retry middleware (priority 550). Responses
    # travel through middlewares in decreasing priority order, so if it stays
    # enabled it retries the 429 itself and the pausing middleware below is
    # bypassed.
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
    # Enable the middleware that pauses the crawl on 429.
    'steam_market.middlewares.TooManyRequestsRetryMiddleware': 543,
}

# Responses with these status codes are retried.
RETRY_HTTP_CODES = [429, 500, 403]

# Base delay between requests: a value in [0, 3) seconds, drawn once when the
# settings module is loaded.
DOWNLOAD_DELAY = random.random() + random.random() + random.random()

# After each request, wait a randomized interval before sending the next one
# (per the Scrapy docs, between 0.5x and 1.5x of DOWNLOAD_DELAY).
RANDOMIZE_DOWNLOAD_DELAY = True