  • Scrapy crawler: proxy IP downloader middleware

    import requests
    from queue import Queue

    from scrapy import signals


    class ProxyDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
        def __init__(self):
            self.ip_pool = Queue()  # pool of available proxy IPs
            self.ip_set = set()     # proxies already fetched, used for deduplication
            self.request_proxy(number=5)
    
        def request_proxy(self, number=5):
            """Fetch proxies from the Kuaidaili API and add unseen ones to the pool.

            :param number: how many proxies to request
            """
            if self.ip_pool.qsize() > 8:  # pool is still well stocked; skip the API call
                return
            url = 'https://dps.kdlapi.com/api/getdps/?orderid=987658645908252&num=%d&pt=1&dedup=1&format=json&sep=1' % number
            r = requests.get(url)
            dc = r.json()  # expected shape: {'data': {'proxy_list': ['ip:port', ...]}}
            for item in dc['data']['proxy_list']:
                if item in self.ip_set:  # skip proxies we have already seen
                    continue
                self.ip_pool.put({'ip_port': item, 'useTimes': 0})
                self.ip_set.add(item)
    
        def get_proxy_ip(self):
            item = self.ip_pool.get()
            item["useTimes"] += 1
            if item["useTimes"] > 10:
                # retire this proxy after ten uses and top the pool up with fresh ones
                self.request_proxy(number=2)
            else:
                self.ip_pool.put(item)  # still usable; return it to the pool
            return "https://" + item["ip_port"]
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create the middleware instance.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
    
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            request.meta["proxy"] = self.get_proxy_ip()
    
            return None
    
        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
    
            # Must either:
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
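
    To route traffic through this middleware, register it in the project
    settings. A minimal sketch, assuming the class lives in
    `myproject/middlewares.py` (the module path is a placeholder; adjust it
    to your project layout):

        # settings.py -- activate the proxy middleware
        DOWNLOADER_MIDDLEWARES = {
            'myproject.middlewares.ProxyDownloaderMiddleware': 543,
        }

    The integer is the middleware order; 543 is the value Scrapy's default
    project template suggests for custom downloader middlewares, and lower
    values run closer to the engine.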
  • Original post: https://www.cnblogs.com/loveprogramme/p/12070407.html