There are plenty of proxy services available online. The downloader middleware below fetches a random proxy from a proxy pool API and attaches it to each Scrapy request.
import logging

import requests


class ProxyMiddleware:

    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        # Ask the proxy pool API for a single proxy address.
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                proxy = response.text.strip()
                return proxy
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # Attach a freshly fetched proxy to the outgoing request.
        proxy = self.get_random_proxy()
        if proxy:
            uri = 'https://{proxy}'.format(proxy=proxy)
            self.logger.debug('Using proxy ' + proxy)
            request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        # Read the proxy pool URL from the project settings.
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )
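To use the middleware, register it in the project's settings.py and point PROXY_URL at the proxy pool API. A minimal sketch follows; the module path myproject.middlewares and the local proxy pool address are assumptions, so adjust them to match your own project:

# settings.py (sketch)
PROXY_URL = 'http://127.0.0.1:5555/random'  # assumed proxy pool endpoint returning one proxy per request

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.ProxyMiddleware': 543,  # assumed module path; priority is a typical value
}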