下面是一个 Scrapy 下载中间件,用于统一捕获 40x/50x 响应以及常见的网络层异常,并把它们封装成特殊的 Response 返回给 spider,具体逻辑见代码注释。
from scrapy.http import HtmlResponse
from twisted.internet import defer
from twisted.internet.error import (TimeoutError, DNSLookupError,
                                    ConnectionRefusedError, ConnectionDone,
                                    ConnectError, ConnectionLost,
                                    TCPTimedOutError)
from twisted.web.client import ResponseFailed
from scrapy.core.downloader.handlers.http11 import TunnelError


class ProcessAllExceptionMiddleware(object):
    """Downloader middleware that converts 4xx/5xx responses and common
    network exceptions into sentinel ``HtmlResponse`` objects, so the
    spider can detect and handle failures by inspecting ``response.url``.
    """

    # Network/download exceptions treated as "expected" failures.
    ALL_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
                      ConnectionRefusedError, ConnectionDone, ConnectError,
                      ConnectionLost, TCPTimedOutError, ResponseFailed,
                      IOError, TunnelError)

    def process_response(self, request, response, spider):
        """Replace 4xx/5xx responses with a sentinel response.

        The sentinel's ``url`` is the original status code as a string
        (e.g. ``'404'``), so the spider can tell failures apart by
        checking ``response.url``. All other responses pass through
        unchanged.
        """
        # Catch responses whose status code starts with 4 or 5
        # (str.startswith accepts a tuple of prefixes).
        if str(response.status).startswith(('4', '5')):
            # Wrap in a minimal response; the spider recognizes the
            # failure because response.url holds the status code string.
            response = HtmlResponse(url=str(response.status), status=200)
            return response
        # Any other status code is left untouched.
        return response

    def process_exception(self, request, exception, spider):
        """Convert known network exceptions into a sentinel response.

        Returns an ``HtmlResponse`` with ``url='exception'`` for any
        exception listed in ``ALL_EXCEPTIONS``; returns ``None`` (falls
        through) for anything else so Scrapy's other exception handlers
        can deal with it.
        """
        # Catch nearly all expected download exceptions.
        if isinstance(exception, self.ALL_EXCEPTIONS):
            # Log the exception type via the spider's logger
            # (lazy %-formatting; preferable to bare print in middleware).
            spider.logger.error('Got exception: %s', exception)
            # Sentinel response the spider recognizes by its url.
            response = HtmlResponse(url='exception')
            return response
        # Log exceptions not covered above; implicit None return lets
        # other exception-processing middlewares handle them.
        spider.logger.error('not contained exception: %s', exception)
这样,spider 只需根据返回 response 的 url(状态码字符串如 '404'/'500',或 'exception')即可区分并分别处理各类失败情况。