class ProxyDownloaderMiddleware(object):
    """Scrapy downloader middleware that routes each request through a
    rotating pool of paid proxies fetched from the kuaidaili API.

    Not all middleware methods need to be defined. If a method is not
    defined, Scrapy acts as if the downloader middleware does not modify
    the passed objects.
    """

    # Retire a proxy after it has been handed out this many times.
    MAX_USE_TIMES = 10

    def __init__(self):
        self.request_proxy_url = ""
        # Pool of {'ipport': str, 'useTimes': int} entries, FIFO rotation.
        self.IpPool = Queue()
        # Every proxy address ever fetched, used to deduplicate API results.
        self.Ipset = set()
        self.request_proxry(number=5)

    def request_proxry(self, number=5):
        """Fetch up to *number* fresh proxies from the API into the pool.

        :param number: how many proxies to request from the API
        """
        # BUG FIX: the original read self.IPportQueue / self.IPset, which
        # were never created (__init__ sets IpPool / Ipset) -> AttributeError.
        if self.IpPool.qsize() > 8:
            # Pool is already well stocked; don't burn API quota.
            return
        # NOTE(review): orderid is a hard-coded credential -- consider
        # moving it to settings or an environment variable.
        url = ('https://dps.kdlapi.com/api/getdps/'
               '?orderid=987658645908252&num=%d&pt=1&dedup=1&format=json&sep=1'
               % number)
        r = requests.get(url)
        dc = r.json()
        for item in dc['data']['proxy_list']:
            if item in self.Ipset:
                # Already handed out this address once; skip duplicates.
                continue
            self.IpPool.put({'ipport': item, 'useTimes': 0})
            self.Ipset.add(item)
            print(item, '+++++++++++++++++')

    def get_proxy_ip(self):
        """Return a proxy URL, rotating the pool and retiring worn entries.

        Takes the oldest pool entry; entries used more than MAX_USE_TIMES
        times are dropped from the pool and replacements are requested.
        """
        item = self.IpPool.get()
        item["useTimes"] += 1
        if item["useTimes"] > self.MAX_USE_TIMES:
            # Proxy is exhausted: leave it out of the pool and top it up.
            self.request_proxry(number=2)
        else:
            self.IpPool.put(item)
        # BUG FIX: entries are stored under 'ipport', not 'ip_port'.
        return "https://" + item["ipport"]

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        #
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        request.meta["proxy"] = self.get_proxy_ip()
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        #
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        #
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)