zoukankan      html  css  js  c++  java
  • Downloader Middleware

    Downloader Middleware

    源码解析:

     1 # 文件:E:\Miniconda\Lib\site-packages\scrapy\core\downloader\middleware.py
     2 """
     3 Downloader Middleware manager
     4 
     5 See documentation in docs/topics/downloader-middleware.rst
     6 """
     7 import six
     8 
     9 from twisted.internet import defer
    10 
    11 from scrapy.http import Request, Response
    12 from scrapy.middleware import MiddlewareManager
    13 from scrapy.utils.defer import mustbe_deferred
    14 from scrapy.utils.conf import build_component_list
    15 
    16 
    17 class DownloaderMiddlewareManager(MiddlewareManager):
    18 
    19     component_name = 'downloader middleware'
    20 
    21     @classmethod
    22     def _get_mwlist_from_settings(cls, settings):
    23         # 从settings.py或这custom_setting中拿到自定义的Middleware中间件
    24         '''
    25         'DOWNLOADER_MIDDLEWARES': {
    26             'mySpider.middlewares.ProxiesMiddleware': 400,
    27             # SeleniumMiddleware
    28             'mySpider.middlewares.SeleniumMiddleware': 543,
    29             'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    30         },
    31         '''
    32         return build_component_list(
    33             settings.getwithbase('DOWNLOADER_MIDDLEWARES'))
    34 
    35     # 将所有自定义Middleware中间件的处理函数添加到对应的methods列表中
    36     def _add_middleware(self, mw):
    37         if hasattr(mw, 'process_request'):
    38             self.methods['process_request'].append(mw.process_request)
    39         if hasattr(mw, 'process_response'):
    40             self.methods['process_response'].insert(0, mw.process_response)
    41         if hasattr(mw, 'process_exception'):
    42             self.methods['process_exception'].insert(0, mw.process_exception)
    43 
    44     # 整个下载流程
    45     def download(self, download_func, request, spider):
    46         @defer.inlineCallbacks
    47         def process_request(request):
    48             # 处理request请求,依次经过各个自定义Middleware中间件的process_request方法,前面有加入到list中
    49             for method in self.methods['process_request']:
    50                 response = yield method(request=request, spider=spider)
    51                 assert response is None or isinstance(response, (Response, Request)), 
    52                         'Middleware %s.process_request must return None, Response or Request, got %s' % 
    53                         (six.get_method_self(method).__class__.__name__, response.__class__.__name__)
    54                 # 这是关键地方
    55                 # 如果在某个Middleware中间件的process_request中处理完之后,生成了一个response对象
    56                 # 那么会直接将这个response return 出去,跳出循环,不再处理其他的process_request
    57                 # 之前我们的header,proxy中间件,都只是加个user-agent,加个proxy,并不做任何return值
    58                 # 还需要注意一点:就是这个return的必须是Response对象
    59                 # 后面我们构造的HtmlResponse正是Response的子类对象
    60                 if response:
    61                     defer.returnValue(response)
    62             # 如果在上面的所有process_request中,都没有返回任何Response对象的话
    63             # 最后,会将这个加工过的Request送往download_func,进行下载,返回的就是一个Response对象
    64             # 然后依次经过各个Middleware中间件的process_response方法进行加工,如下
    65             defer.returnValue((yield download_func(request=request,spider=spider)))
    66 
    67         @defer.inlineCallbacks
    68         def process_response(response):
    69             assert response is not None, 'Received None in process_response'
    70             if isinstance(response, Request):
    71                 defer.returnValue(response)
    72 
    73             for method in self.methods['process_response']:
    74                 response = yield method(request=request, response=response,
    75                                         spider=spider)
    76                 assert isinstance(response, (Response, Request)), 
    77                     'Middleware %s.process_response must return Response or Request, got %s' % 
    78                     (six.get_method_self(method).__class__.__name__, type(response))
    79                 if isinstance(response, Request):
    80                     defer.returnValue(response)
    81             defer.returnValue(response)
    82 
    83         @defer.inlineCallbacks
    84         def process_exception(_failure):
    85             exception = _failure.value
    86             for method in self.methods['process_exception']:
    87                 response = yield method(request=request, exception=exception,
    88                                         spider=spider)
    89                 assert response is None or isinstance(response, (Response, Request)), 
    90                     'Middleware %s.process_exception must return None, Response or Request, got %s' % 
    91                     (six.get_method_self(method).__class__.__name__, type(response))
    92                 if response:
    93                     defer.returnValue(response)
    94             defer.returnValue(_failure)
    95 
    96         deferred = mustbe_deferred(process_request, request)
    97         deferred.addErrback(process_exception)
    98         deferred.addCallback(process_response)
    99         return deferred
  • 相关阅读:
    接水果(fruit)
    大融合
    排序(sortb)
    latex公式测试
    次小生成树
    HDU 2973 YAPTCHA (威尔逊定理)
    状压DP概念 及例题(洛谷 P1896 互不侵犯)
    ICPC Asia Nanning 2017 F. The Chosen One (大数、规律、2的k次幂)
    HDU 1074 Doing Homework (状压DP)
    最长上升(不下降)子序列(LIS) 不同求解方法(动规、贪心)
  • 原文地址:https://www.cnblogs.com/guozepingboke/p/10774181.html
Copyright © 2011-2022 走看看