一、下载中间件 (1. Downloader middleware)
from scrapy import signals
from scrapy.http import Response
from scrapy.exceptions import IgnoreRequest
from AMAZON.proxy_handle import get_proxy, delete_proxy


class DownMiddleware1(object):
    """Downloader middleware that routes every request through a rotating
    HTTP proxy obtained from AMAZON.proxy_handle, and swaps the proxy out
    when a download fails."""

    def process_request(self, request, spider):
        """
        Called for every request on its way to the downloader.

        :param request: the Request being processed
        :param spider: the Spider that issued the request
        :return:
            None                -> continue with remaining middlewares / download
            Response object     -> stop process_request chain, start process_response
            Request object      -> stop middleware chain, reschedule the Request
            raise IgnoreRequest -> stop process_request, start process_exception
        """
        print('下载中间件1')
        # Fetch the proxy exactly once and reuse it: the original code called
        # get_proxy() twice (print + meta), which could log one proxy while
        # actually using a different one, and burned an extra proxy per request.
        proxy = get_proxy()
        print('gggggggggggggggggggg', proxy)
        # Short timeout so dead proxies fail fast and get rotated by
        # process_exception instead of stalling the crawl.
        request.meta['download_timeout'] = 10
        request.meta['proxy'] = 'http://' + proxy
        print(request.meta)

    def process_response(self, request, response, spider):
        """
        Called with the response on its way back from the downloader.

        :param request: the originating Request
        :param response: the downloaded Response
        :param spider: the Spider that issued the request
        :return:
            Response object     -> handed to the next middleware's process_response
            Request object      -> middleware chain stops; request is rescheduled
            raise IgnoreRequest -> Request.errback is called
        """
        print('response1')
        return response

    def process_exception(self, request, exception, spider):
        """
        Called when the download handler or process_request() raises.

        Removes the (presumably dead) proxy from the pool, attaches a fresh
        one, and returns the request so it gets rescheduled for download.

        :param request: the Request that failed
        :param exception: the exception that was raised
        :param spider: the Spider that issued the request
        :return:
            None            -> later middlewares continue handling the exception
            Response object -> stop the process_exception chain
            Request object  -> middleware chain stops; request is re-downloaded
        """
        print('异常1')
        # Strip the scheme to recover the bare "ip:port" the pool tracks.
        # Guarded with .get(): if no proxy was attached yet, we must not
        # raise a KeyError here and mask the real download exception.
        old_proxy = request.meta.get('proxy', '').split("//")[-1]
        print('oooooooooooo', old_proxy)
        if old_proxy:
            delete_proxy(old_proxy)
        request.meta['proxy'] = 'http://' + get_proxy()
        return request