  • Section 19: A Detailed Look at the Scrapy Framework's Middleware File

    # -*- coding: utf-8 -*-

    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html

    from scrapy import signals

    # ===========================Spider Middleware============================
    # Definition: the layer that sits between the Scrapy engine and the
    # spiders. It processes the responses going into the spiders (input) and
    # the requests/items coming back out of them (output).
    # Scrapy ships with several ready-to-use spider middlewares, registered
    # in SPIDER_MIDDLEWARES_BASE:
    # {
    # 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    # 'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    # 'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    # 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    # 'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # }
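    #
    # A project enables or reorders spider middlewares in settings.py through
    # SPIDER_MIDDLEWARES, which is merged with the base dict above. A sketch
    # (543 is the priority the Scrapy project scaffold suggests):
    # SPIDER_MIDDLEWARES = {
    #     'maoyan.middlewares.MaoyanSpiderMiddleware': 543,
    # }
    # Setting a built-in middleware's value to None disables it, e.g.:
    # SPIDER_MIDDLEWARES = {
    #     'scrapy.spidermiddlewares.referer.RefererMiddleware': None,
    # }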

    # =================The SpiderMiddleware class==================
    class MaoyanSpiderMiddleware(object):

        # Class method; through the crawler argument you can read the global
        # settings, e.g. crawler.settings.get('SOME_SETTING')
        @classmethod
        def from_crawler(cls, crawler):
            """
            :param crawler: gives access to the project settings via
                crawler.settings.get(...)
            """
            s = cls()
            # Connect the spider_opened handler to the spider_opened signal,
            # which fires when a spider starts crawling; it is typically used
            # to allocate per-spider resources.
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

            # Connect the spider_closed handler to the spider_closed signal,
            # used to release the resources acquired in spider_opened.
            # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
            return s

        # Called for each response that goes through the spider middleware on
        # its way into the spider. Should return None (continue processing)
        # or raise an exception.
        def process_spider_input(self, response, spider):
            """
            :param response: the response being processed
            :param spider: the spider this response is intended for
            """
            return None

        # Called with the results the spider returns after processing a
        # response. Must return an iterable of Request or Item objects.
        def process_spider_output(self, response, result, spider):
            """
            :param response: the response the spider just processed
            :param result: an iterable of Request/Item objects returned by
                the spider for this response
            :param spider: the spider whose results are being processed
            """
            # Pass every result through unchanged
            for i in result:
                yield i

        # Called when a spider or the process_spider_input() method of
        # another spider middleware raises an exception. Should return None
        # or an iterable of Request/Item objects.
        def process_spider_exception(self, response, exception, spider):
            """
            :param response: the response being processed when the exception
                was raised
            :param exception: the exception that was raised
            :param spider: the spider that raised the exception
            """
            pass

        # Called with the start requests of the spider. Works like
        # process_spider_output(), except that there is no associated
        # response; it must return an iterable of Requests only (no items).
        def process_start_requests(self, start_requests, spider):
            """
            :param start_requests: an iterable of the spider's start requests
            :param spider: the spider the start requests belong to
            """
            # Pass every start request through unchanged
            for r in start_requests:
                yield r


        # Called when the spider is opened, i.e. crawling starts; the usual
        # place to allocate per-spider resources.
        def spider_opened(self, spider):
            """
            :param spider: the spider that was opened
            """
            spider.logger.info('Spider opened: %s' % spider.name)


        # # Called when the spider is closed; the place to release the
        # # resources acquired in spider_opened.
        # def spider_closed(self, spider):
        #     """
        #     :param spider: the spider that was closed
        #     """
        #     spider.logger.info('Spider closed: %s' % spider.name)
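
    # ----------------------------------------------------------------------
    # Illustrative sketch, not part of the generated template: a spider
    # middleware that filters the spider's output. The middleware name and
    # the required 'title' field are assumptions for the example.
    class RequiredFieldSpiderMiddleware(object):

        # Drop dict items that are missing the assumed 'title' field;
        # Requests (and anything that is not a dict) pass through untouched.
        def process_spider_output(self, response, result, spider):
            for i in result:
                if isinstance(i, dict) and not i.get('title'):
                    spider.logger.warning('Dropped item without title from %s' % response.url)
                    continue
                yield i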




    # ======================Downloader Middleware========================
    # Definition: the layer between the Scrapy engine and the downloader; it
    # processes the requests the engine sends to the downloader and the
    # responses the downloader returns (see the Scrapy architecture diagram).
    # Typical uses: rotating the User-Agent, handling redirects, setting
    # proxies, retrying failed requests, setting cookies, and so on.
    # Scrapy ships with several ready-to-use downloader middlewares,
    # registered in DOWNLOADER_MIDDLEWARES_BASE (paths below use the current
    # scrapy.downloadermiddlewares.* modules; exact entries and priorities
    # vary slightly between Scrapy versions):
    # {
    # 'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    # 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    # 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    # 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 400,
    # 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500,
    # 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 550,
    # 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    # 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    # 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    # 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    # 'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
    # 'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    # 'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # }
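    #
    # Custom downloader middlewares are enabled the same way in settings.py
    # via DOWNLOADER_MIDDLEWARES, merged with the base dict above. A sketch:
    # DOWNLOADER_MIDDLEWARES = {
    #     'maoyan.middlewares.MaoyanDownloaderMiddleware': 543,
    # }
    # Lower numbers run closer to the engine, higher numbers closer to the
    # downloader; process_request() is called in ascending priority order and
    # process_response() in descending order.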


    # ===============The DownloaderMiddleware class=================
    class MaoyanDownloaderMiddleware(object):

        # Class method; through the crawler argument you can read the global
        # settings, e.g. crawler.settings.get('SOME_SETTING')
        @classmethod
        def from_crawler(cls, crawler):
            """
            :param crawler: gives access to the project settings via
                crawler.settings.get(...)
            """
            s = cls()
            # Connect the spider_opened handler to the spider_opened signal
            # (commonly used to allocate per-spider resources).
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

            # Connect the spider_closed handler to the spider_closed signal,
            # used to release the resources acquired in spider_opened.
            # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
            return s

        # Called for each request that goes through the downloader
        # middleware, after it has been pulled from the scheduler and before
        # it reaches the downloader. Must return None (continue processing),
        # a Response object, a Request object, or raise IgnoreRequest.
        def process_request(self, request, spider):
            """
            :param request: the request pulled from the scheduler
            :param spider: the spider that generated this request
            """
            return None

        # Called with the response the downloader returns for a request.
        # Must return a Response object, a Request object, or raise
        # IgnoreRequest.
        def process_response(self, request, response, spider):
            """
            :param request: the request that produced this response
            :param response: the response returned by the downloader
            :param spider: the spider the response is intended for
            """
            return response

        # Called when a download handler or the process_request() method of
        # another downloader middleware raises an exception. Should return
        # None, a Response object, or a Request object.
        def process_exception(self, request, exception, spider):
            """
            :param request: the request that caused the exception
            :param exception: the exception that was raised
            :param spider: the spider the failing request belongs to
            """
            pass

        # Called when the spider is opened, i.e. crawling starts; the usual
        # place to allocate per-spider resources.
        def spider_opened(self, spider):
            """
            :param spider: the spider that was opened
            """
            spider.logger.info('Spider opened: %s' % spider.name)


        # # Called when the spider is closed; the place to release the
        # # resources acquired in spider_opened.
        # def spider_closed(self, spider):
        #     """
        #     :param spider: the spider that was closed
        #     """
        #     spider.logger.info('Spider closed: %s' % spider.name)
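
As a concrete illustration of the User-Agent/proxy use case mentioned above, here is a minimal sketch of a custom downloader middleware. The middleware name, the USER_AGENTS pool and the commented-out proxy URL are assumptions for the example, not part of the generated template:

    import random

    class RandomUserAgentMiddleware(object):

        # Hypothetical pool of User-Agent strings; replace with your own.
        USER_AGENTS = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        ]

        def process_request(self, request, spider):
            # Pick a random User-Agent for this request; returning None lets
            # the request continue through the remaining middlewares and on
            # to the downloader.
            request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
            # To route the request through a proxy, set one (assumed URL):
            # request.meta['proxy'] = 'http://127.0.0.1:8888'
            return None

To activate such a middleware, register it in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'maoyan.middlewares.RandomUserAgentMiddleware': 543}.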

  • Original article: https://www.cnblogs.com/zhaco/p/10615805.html