zoukankan      html  css  js  c++  java
  • scrapy中间件

    一、下载中间件

    from scrapy import signals
    from scrapy.http import Response
    from scrapy.exceptions import IgnoreRequest
    from AMAZON.proxy_handle import get_proxy,delete_proxy
    # print('eeeeeeeeeeee',get_proxy())
    
    class DownMiddleware1(object):
        """Downloader middleware that routes every request through a pooled proxy.

        A proxy is attached in ``process_request``. When a download raises,
        ``process_exception`` discards the proxy that was in use, attaches a
        new one and asks Scrapy to schedule the request again.
        """

        def process_request(self, request, spider):
            """Attach a proxy and a download timeout to an outgoing request.

            Called for each request that passes through the downloader
            middleware chain before it is downloaded.

            :param request: the request about to be downloaded
            :param spider: the spider that issued the request
            :return:
                None: continue with the remaining middlewares and download;
                Response: stop calling process_request, start process_response;
                Request: stop the chain and hand the request to the scheduler;
                raise IgnoreRequest: stop process_request, start
                process_exception.
                This implementation always returns None.
            """
            print('下载中间件1')
            # Fetch the proxy once so the logged value is the one actually
            # used (a second get_proxy() call may return a different proxy).
            proxy = get_proxy()
            print('gggggggggggggggggggg', proxy)

            # Proxies that need credentials use 'http://user:pwd@ip:port'.
            request.meta['download_timeout'] = 10
            request.meta['proxy'] = 'http://' + proxy
            print(request.meta)

        def process_response(self, request, response, spider):
            """Pass the downloaded response through unchanged.

            Called with the response produced by the downloader (or by a
            later middleware) on its way back to the engine.

            :param request: the request that produced the response
            :param response: the response being processed
            :param spider: the spider the response is intended for
            :return:
                Response: handed on to the next middleware's process_response;
                Request: stop the chain, the request is rescheduled;
                raise IgnoreRequest: Request.errback is called.
                This implementation always returns the response.
            """
            print('response1')
            return response

        def process_exception(self, request, exception, spider):
            """Swap the failing proxy for a fresh one and retry the request.

            Called when a download handler or a process_request() of a
            downloader middleware raises an exception.

            :param request: the request that caused the exception
            :param exception: the exception that was raised
            :param spider: the spider that issued the request
            :return:
                None: let the remaining middlewares handle the exception;
                Response: stop calling further process_exception methods;
                Request: stop the chain, the request is rescheduled.
                This implementation always returns the request.
            """
            print('异常1')

            # The exception may fire before any proxy was assigned, so do not
            # assume the 'proxy' key exists.
            current_proxy = request.meta.get('proxy')
            if current_proxy:
                # Strip the scheme: the pool is addressed without 'http://'.
                old_proxy = current_proxy.split("//")[-1]
                print('oooooooooooo', old_proxy)
                delete_proxy(old_proxy)

            request.meta['proxy'] = 'http://' + get_proxy()
            # This request has already been seen by the scheduler; without
            # dont_filter the duplicate filter would drop the retry.
            request.dont_filter = True
            return request
  • 相关阅读:
    网络信息安全攻防学习平台第7题
    深入理解读写锁ReentrantReadWriteLock
    彻底理解ReentrantLock
    (三)应该了解关于并发相关的概念
    (二)并发编程的优缺点
    Linux Makefile多目录的编写
    libcurl 下载上传
    MFC枚举USB设备碰到的一个疑难,还没解决
    MFC一个令人纠心的错误
    如何为你的App获取用户的反馈和5星级评论
  • 原文地址:https://www.cnblogs.com/ldq1996/p/8342112.html
Copyright © 2011-2022 走看看