  • Scrapy crawler: a proxy IP downloader middleware

    The middleware below maintains a small pool of paid proxies fetched from the KDL API, attaches one to every outgoing request, and retires a proxy after it has been used ten times. An enabling snippet for settings.py follows the class (see the sketch after the code).

    import requests
    from queue import Queue

    from scrapy import signals


    class ProxyDownloaderMiddleware(object):
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.
        def __init__(self):
            self.ip_pool = Queue()  # pool of available proxies
            self.ip_set = set()     # proxies already fetched, to avoid re-adding duplicates
            self.request_proxy(number=5)
    
        def request_proxy(self, number=5):
            """Fetch `number` fresh proxies from the provider and add them to the pool.

            :param number: how many proxies to request
            """
            if self.ip_pool.qsize() > 8:  # the pool is full enough already
                return
            url = 'https://dps.kdlapi.com/api/getdps/?orderid=987658645908252&num=%d&pt=1&dedup=1&format=json&sep=1' % number
            r = requests.get(url)
            dc = r.json()
            # The JSON response carries the new proxies under data.proxy_list.
            for item in dc['data']['proxy_list']:
                if item in self.ip_set:  # skip proxies we have already seen
                    continue
                self.ip_pool.put({'ipport': item, 'useTimes': 0})
                self.ip_set.add(item)
    
        def get_proxy_ip(self):
            """Take a proxy from the pool, tracking how often it has been used."""
            item = self.ip_pool.get()
            item['useTimes'] += 1
            if item['useTimes'] > 10:
                # Retire this proxy (do not put it back) and top the pool up.
                self.request_proxy(number=2)
            else:
                self.ip_pool.put(item)  # still fresh enough, return it to the pool
            return 'https://' + item['ipport']
    
        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s
    
        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
    
            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            # Route this request through a proxy taken from the pool.
            request.meta["proxy"] = self.get_proxy_ip()
    
            return None
    
        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.
    
            # Must either:
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response
    
        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.
    
            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass
    
        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)
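
    To use the middleware, register it in the project's settings.py so Scrapy routes every download through it. A minimal sketch, assuming the class lives in a module importable as myproject.middlewares (the module path and the priority value 543 are placeholders to adapt to your project):

        # settings.py
        # Enable the proxy middleware project-wide. 543 is the priority used in
        # Scrapy's own examples; any value that orders it correctly among your
        # other downloader middlewares works.
        DOWNLOADER_MIDDLEWARES = {
            'myproject.middlewares.ProxyDownloaderMiddleware': 543,
        }

    Once enabled, process_request() runs for every request and sets request.meta['proxy'], so spiders need no proxy-handling code of their own.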
  • Original post: https://www.cnblogs.com/loveprogramme/p/12070407.html