zoukankan      html  css  js  c++  java
  • Scrapy框架: middlewares.py设置

    # -*- coding: utf-8 -*-
    
    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
    
    from scrapy import signals
    
    class DownloadtestSpiderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.
    
            # Should return None or raise an exception.
            return None
    
        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.
    
            # Must return an iterable of Request, dict or Item objects.
            for i in result:
                yield i
    
        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.
    
            # Should return either None or an iterable of Response, dict
            # or Item objects.
            pass
    
        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn’t have a response associated.
    
            # Must return only requests (not items).
            for r in start_requests:
                yield r
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
    
    class DownloadtestDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
    
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            return None
    
        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
    
            # Must either;
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
    
    import random
    
    #设置请求头
    class RandomUA():
        def __init__(self):
            self.user_agent=[
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.16',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'
            ]
        
        def process_request(self, request, spider):
            request.headers['User-Agent'] = random.choice(self.user_agent)
        
        def process_response(self, request, response, spider):
            response.status=201
            return response
    
    #设置代理
    class ProxyMiddleware:
        proxy_list=[
            "http://127.0.0.1:8080"
            # "http://183.91.33.41:80",
            # "http://111.29.3.190:80",
            # "http://124.156.108.71:82"
        ]
    
        def process_request(self, request, spider):
            ip=random.choice(self.proxy_list)
            request.meta['proxy']=ip
    
  • 相关阅读:
    C#:将空间数据加载到树视图控件
    C# 常见错误
    C#:Application操作(待补充)
    C#:XML操作(简单)
    C#:xml操作(待补充)
    C#:消息框
    C#:数学运算(待补充)
    C#:Ini文件操作(待补充)
    C#:文件操作(待补充)
    2015河南省农村拆迁赔偿流程
  • 原文地址:https://www.cnblogs.com/hankleo/p/11829704.html
Copyright © 2011-2022 走看看