  • Part 19: A Detailed Look at the Middlewares File in the Scrapy Crawler Framework

    # -*- coding: utf-8 -*-

    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html

    from scrapy import signals

    # =========================== Spider Middleware ============================
    # Definition: the layer of hooks between the Scrapy engine and the spiders.
    # It processes the responses going into the spiders and the requests/items
    # coming back out of them.
    # Scrapy already ships with several spider middlewares that are enabled by
    # default; they are defined in SPIDER_MIDDLEWARES_BASE:
    # {
    #     'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
    #     'scrapy.spidermiddlewares.offsite.OffsiteMiddleware': 500,
    #     'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
    #     'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
    #     'scrapy.spidermiddlewares.depth.DepthMiddleware': 900,
    # }
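    # A minimal sketch (not part of the generated template): a custom spider
    # middleware is switched on in settings.py via SPIDER_MIDDLEWARES; the
    # number sets its priority relative to the built-ins above, and the
    # 'maoyan' package path is assumed from this example project:
    #
    # SPIDER_MIDDLEWARES = {
    #     'maoyan.middlewares.MaoyanSpiderMiddleware': 543,
    # }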

    # ========================= SpiderMiddleware class =========================
    class MaoyanSpiderMiddleware(object):

        # Class method: through the crawler argument you can read the global
        # settings, e.g. crawler.settings.get(name).
        @classmethod
        def from_crawler(cls, crawler):
            """
            :param crawler: gives access to the global settings, e.g. crawler.settings.get(name)
            """
            s = cls()
            # Connect spider_opened() to the spider_opened signal, which fires
            # when the spider starts crawling; it is typically used to set up
            # the spider's resources.
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

            # Connect spider_closed() to the spider_closed signal, which fires
            # when the spider finishes; it releases whatever spider_opened set up.
            # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
            return s
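        # A hedged sketch: from_crawler() is the usual place to pull custom
        # options out of settings.py ('MY_DELAY' is a made-up key, not a real
        # Scrapy setting):
        #
        # @classmethod
        # def from_crawler(cls, crawler):
        #     s = cls()
        #     s.delay = crawler.settings.getfloat('MY_DELAY', 0.0)
        #     crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        #     return s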

        # Called for each response that goes through the spider middleware on
        # its way into the spider. Should return None or raise an exception.
        def process_spider_input(self, response, spider):
            """
            :param response: the response being processed
            :param spider: the spider this response is intended for
            """
            return None
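        # Sketch: raising here rejects the response; Scrapy then routes the
        # exception to the request's errback / other middlewares'
        # process_spider_exception() instead of the spider callback:
        #
        # def process_spider_input(self, response, spider):
        #     if response.status != 200:
        #         raise ValueError('unexpected status %s' % response.status)
        #     return None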

        # Called with the results the spider returns after handling a response.
        def process_spider_output(self, response, result, spider):
            """
            :param response: the response the spider just processed
            :param result: an iterable of Item and/or Request objects returned by the spider
            :param spider: the spider whose output is being processed
            """
            # Pass every result through unchanged.
            for i in result:
                yield i
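        # Sketch: this is a natural place to drop or adjust results before
        # they reach the engine ('title' is a hypothetical item field):
        #
        # def process_spider_output(self, response, result, spider):
        #     for i in result:
        #         if isinstance(i, dict) and not i.get('title'):
        #             continue  # drop items missing a required field
        #         yield i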

        # Called when the spider or another middleware raises an exception
        # while processing a response.
        def process_spider_exception(self, response, exception, spider):
            """
            :param response: the response being processed when the exception was raised
            :param exception: the exception that was raised
            :param spider: the spider that raised the exception
            """
            pass
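        # Sketch: returning an iterable here swallows the exception and feeds
        # replacement results onward; returning None lets other middlewares
        # keep handling it:
        #
        # def process_spider_exception(self, response, exception, spider):
        #     spider.logger.warning('error on %s: %r' % (response.url, exception))
        #     return []  # swallow the exception, emit nothing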

        # Called with the spider's start requests; must return an iterable of
        # Request objects.
        def process_start_requests(self, start_requests, spider):
            """
            :param start_requests: an iterable of the spider's start requests
            :param spider: the spider the start requests belong to
            """
            # Pass every start request through unchanged.
            for r in start_requests:
                yield r
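        # Sketch: start requests can be rewritten on the fly before anything
        # is downloaded ('from_start' is a made-up meta key):
        #
        # def process_start_requests(self, start_requests, spider):
        #     for r in start_requests:
        #         r.meta['from_start'] = True
        #         yield r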


        # Called (via the signal connected in from_crawler) when the spider is
        # opened, i.e. when crawling starts and resources are set up.
        def spider_opened(self, spider):
            """
            :param spider: the spider that was opened
            """
            spider.logger.info('Spider opened: %s' % spider.name)

        # # Called when the spider is closed; releases whatever spider_opened
        # # set up.
        # def spider_closed(self, spider):
        #     """
        #     :param spider: the spider that was closed
        #     """
        #     spider.logger.info('Spider closed: %s' % spider.name)




    # ========================= Downloader Middleware ==========================
    # Definition: the layer of hooks between the Scrapy engine and the
    # downloader. It processes the requests the engine sends to the downloader
    # and the responses that come back (see the Scrapy architecture diagram).
    # Typical uses: rotating the User-Agent, handling redirects, setting
    # proxies, retrying failures, setting cookies, and so on.
    # Scrapy already ships with several downloader middlewares that are enabled
    # by default; they are defined in DOWNLOADER_MIDDLEWARES_BASE:
    # {
    #     'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
    #     'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
    #     'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    #     'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
    #     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
    #     'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
    #     'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
    #     'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
    #     'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
    #     'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
    #     'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
    #     'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware': 830,
    #     'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
    #     'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
    # }
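    # Sketch: a custom downloader middleware is enabled the same way, through
    # DOWNLOADER_MIDDLEWARES in settings.py (the 'maoyan' path follows this
    # example project):
    #
    # DOWNLOADER_MIDDLEWARES = {
    #     'maoyan.middlewares.MaoyanDownloaderMiddleware': 543,
    # }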


    # ======================= DownloaderMiddleware class =======================
    class MaoyanDownloaderMiddleware(object):

        # Class method: through the crawler argument you can read the global
        # settings, e.g. crawler.settings.get(name).
        @classmethod
        def from_crawler(cls, crawler):
            """
            :param crawler: gives access to the global settings, e.g. crawler.settings.get(name)
            """
            s = cls()
            # Connect spider_opened() to the spider_opened signal (set up
            # resources when crawling starts).
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)

            # Connect spider_closed() to the spider_closed signal (release the
            # resources allocated in spider_opened).
            # crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
            return s

        # Called for each request pulled from the scheduler, before it is sent
        # to the downloader. Returning None lets the request continue through
        # the remaining middlewares; it may also return a Response or a new
        # Request instead.
        def process_request(self, request, spider):
            """
            :param request: the request scheduled for download
            :param spider: the spider the request belongs to
            """
            return None
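        # Sketch: this is where a proxy or User-Agent is usually injected (the
        # header value and proxy address below are placeholders):
        #
        # def process_request(self, request, spider):
        #     request.headers['User-Agent'] = 'my-crawler/1.0'
        #     request.meta['proxy'] = 'http://127.0.0.1:8888'
        #     return None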

        # Called with the response the downloader produced for a request,
        # before it is passed back to the engine and on to the spider.
        def process_response(self, request, response, spider):
            """
            :param request: the request that produced this response
            :param response: the response returned by the downloader
            :param spider: the spider the response is intended for
            """
            return response
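        # Sketch: returning a Request here re-schedules it instead of passing
        # the response on (a real version should cap the number of retries):
        #
        # def process_response(self, request, response, spider):
        #     if response.status == 503:
        #         return request.replace(dont_filter=True)  # retry the request
        #     return response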

        # Called when the downloader or another middleware's process_request()
        # raises an exception (e.g. a timeout or connection error).
        def process_exception(self, request, exception, spider):
            """
            :param request: the request that caused the exception
            :param exception: the exception that was raised
            :param spider: the spider the request belongs to
            """
            pass
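        # Sketch: returning a Request here retries the download, e.g. after a
        # timeout; returning None lets other middlewares handle the exception:
        #
        # from twisted.internet.error import TimeoutError
        #
        # def process_exception(self, request, exception, spider):
        #     if isinstance(exception, TimeoutError):
        #         return request.replace(dont_filter=True)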

        # Called (via the signal connected in from_crawler) when the spider is
        # opened.
        def spider_opened(self, spider):
            """
            :param spider: the spider that was opened
            """
            spider.logger.info('Spider opened: %s' % spider.name)

        # # Called when the spider is closed; releases whatever spider_opened
        # # set up.
        # def spider_closed(self, spider):
        #     """
        #     :param spider: the spider that was closed
        #     """
        #     spider.logger.info('Spider closed: %s' % spider.name)

  • Original post: https://www.cnblogs.com/zhaco/p/10615805.html