zoukankan      html  css  js  c++  java
  • 写一个scrapy中间件--ip代理池

    middleware文件


    #
    -*- coding: utf-8 -*- # Define here the models for your spider middleware # See documentation in: # https://docs.scrapy.org/en/latest/topics/spider-middleware.html import random from scrapy import signals class TutorialDownloaderMiddleware(object): # Not all methods need to be defined. If a method is not defined, # scrapy acts as if the downloader middleware does not modify the # passed objects. @classmethod def from_crawler(cls, crawler): # This method is used by Scrapy to create your spiders. s = cls() crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) return s def process_request(self, request, spider): # Called for each request that goes through the downloader # middleware. # Must either: # - return None: continue processing this request # - or return a Response object # - or return a Request object # - or raise IgnoreRequest: process_exception() methods of # installed downloader middleware will be called return None def process_response(self, request, response, spider): # Called with the response returned from the downloader. # Must either; # - return a Response object # - return a Request object # - or raise IgnoreRequest return response def process_exception(self, request, exception, spider): # Called when a download handler or a process_request() # (from other downloader middleware) raises an exception. # Must either: # - return None: continue processing this exception # - return a Response object: stops process_exception() chain # - return a Request object: stops process_exception() chain pass def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) # 创建一个中间件 ip代理池 from collections import defaultdict from scrapy.exceptions import NotConfigured
    class RandomProxyMiddleware(object): def __init__(self, settings): # 第三步 初始化配置和变量 # 在settings中写一个 PROXIES 列表配置 # 从settings中把代理读进来(把环境变量读进来) self.proxies = settings.getlist("PROXIES") self.stats = defaultdict(int) # 默认值是0 统计次数 self.max_failed = 3 # 请求最多不超过3次 @classmethod def from_cralwer(cls, crawler): # 第一步 创建中间件对象 # 首先获取配置 HTTPPROXY_ENABLED 看看是否启用代理, if not crawler.settings.getbool("HTTPPROXY_ENABLED"): # 如果没有启用代理 raise NotConfigured # auth_encoding = crawler.settings.get("HTTPPROXY_AUTH_ENCODING") # 读取配置,这里暂时不用 # 第二步 return cls(crawler.settings) # cls()实际调用的是 init()函数,如果init接受参数,cls就需要参数 def process_request(self, request, spider): # 第四步 为每个request对象随机分配一个ip代理 # 让这个请求使用代理 初始url不使用代理ip if self.proxies and not request.meta.get("proxy") and request.url not in spider.start_urls: request.meta["proxy"] = random.choice(self.proxies)
          
    def process_response(self, request, response, spider): # 第五步: 请求成功 cur_proxy = request.meta.get('proxy') # 判断是否被对方禁封 if response.status > 400: # 给相应的ip失败次数 +1 self.stats[cur_proxy] += 1 print("当前ip{},第{}次出现错误状态码".format(cur_proxy, self.stats[cur_proxy])) # 当某个ip的失败次数累计到一定数量 if self.stats[cur_proxy] >= self.max_failed: # 当前ip失败超过3次 print("当前状态码是{},代理{}可能被封了".format(response.status, cur_proxy)) # 可以认为该ip被对方封了,从代理池中删除这个ip self.remove_proxy(cur_proxy) del request.meta['proxy'] # 将这个请求重新给调度器,重新下载 return request # 状态码正常的时候,正常返回 return response def process_exception(self, request, exception, spider): # 第五步:请求失败 cur_proxy = request.meta.get('proxy') # 取出当前代理 from twisted.internet.error import ConnectionRefusedError, TimeoutError # 如果本次请求使用了代理,并且网络请求报错,认为这个ip出了问题 if cur_proxy and isinstance(exception, (ConnectionRefusedError, TimeoutError)): print("当前的{}和当前的{}".format(exception, cur_proxy)) self.remove_proxy(cur_proxy) del request.meta['proxy'] # 重新下载这个请求 return request def remove_proxy(self, proxy): if proxy in self.proxies: self.proxies.remove(proxy) print("从代理列表中删除{}".format(proxy))
    settings 文件
    
    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
       'tutorial.middlewares.RandomProxyMiddleware': 749,  # priority lowered to 749 so it runs just before Scrapy's built-in HttpProxyMiddleware (750)
    }
  • 相关阅读:
    Civil 3D 二次开发 创建Civil 3D 对象—— 01 —— 创建几何空间点
    Civil 3D 二次开发 创建Civil 3D 对象—— 00 ——
    Civil 3D 二次开发 创建AutoCAD对象—— 01 —— 创建直线
    Civil 3D 二次开发 新建CLR项目出现错误C2143
    Civil 3D 二次开发 创建AutoCAD对象—— 00 ——
    了解AutoCAD对象层次结构 —— 6 ——块表记录
    datepicker97使用
    使用angular 外接 templateUrl,使用ng-include
    angularJs 遮罩
    网上找的有关css兼容问题
  • 原文地址:https://www.cnblogs.com/kenD/p/12243717.html
Copyright © 2011-2022 走看看