免责声明:本文仅供学习参考使用,不能用于恶意攻击网站。考虑到安全性以及法律问题,本人仅提供部分代码以及破解思路。
思路:
首先Boss采用的反爬虫机制是IP封禁,并且所有内容都是动态加载的。既然是动态加载的,就需要借助selenium和驱动,或者使用splash,这里我所使用的是selenium。对于IP封禁,就使用代理IP池来解决。但是之前我们搭建的代理IP池只是爬取所有代理IP并筛选其可用性,并没有筛选代理IP的类型;如果在爬取过程中使用了普通代理或者透明代理,同样也会被封掉,所以想完完整整地爬取下来所有信息就只能使用付费代理了。既然是作为学习参考使用,我就直接使用之前搭建的代理IP池了。
部分代码如下:
import logging
import random
from time import sleep

import requests
from scrapy.http import HtmlResponse


class PRoxyMiddleware:
    """Downloader middleware that attaches a random proxy to retried requests.

    Proxies come from a local proxy-pool HTTP service that returns one
    ``host:port`` address per request.  A proxy is attached only when the
    request has already been retried (``retry_times`` present in its meta),
    so the first attempt goes out with the crawler's own IP.
    """

    # Fallback pool endpoint, used when the PROXY_URL setting is absent.
    DEFAULT_POOL_URL = 'http://localhost:5555/random'

    def __init__(self, proxy_url):
        """``proxy_url``: pool endpoint from the PROXY_URL crawler setting.

        Bug fix: the original accepted ``proxy_url`` but ignored it and
        always used the hard-coded localhost address.  The configured value
        is now honored, with the old hard-coded URL kept as the fallback.
        """
        self.logger = logging.getLogger(__name__)
        self.proxies_pool_url = proxy_url or self.DEFAULT_POOL_URL

    def get_random_proxy(self):
        """Fetch one proxy address (``host:port`` text) from the pool.

        Returns the address string on success, or a falsy value (``None``)
        when the pool is unreachable or answers with a non-200 status.
        The original returned ``False`` on connection error but an implicit
        ``None`` on non-200; callers only test truthiness, so the two
        failure paths are unified to ``None``.  A timeout is added so a
        hung pool service cannot stall the whole crawl.
        """
        try:
            response = requests.get(url=self.proxies_pool_url, timeout=5)
            if response.status_code == 200:
                return response.text
        except requests.exceptions.ConnectionError:
            return None
        return None

    def process_request(self, request, spider):
        """Assign a proxy, but only to requests that are being retried."""
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                # The original wrote f'https://{proxy}'.format(proxy=proxy);
                # the .format() call was a no-op on an already-interpolated
                # f-string, so only the f-string remains.
                uri = f'https://{proxy}'
                self.logger.debug('使用代理' + proxy)
                request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy factory: read PROXY_URL from the crawler settings."""
        settings = crawler.settings
        return cls(proxy_url=settings.get('PROXY_URL'))


class BossproDownloaderMiddleware:
    """Downloader middleware: random UA spoofing + selenium page rendering."""

    # Pool of desktop Chrome user-agent strings; one is picked at random
    # per request to make traffic look like it comes from many browsers.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def process_request(self, request, spider):
        """UA spoofing: set a random User-Agent header on every request."""
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None

    def process_response(self, request, response, spider):
        """Re-render the page with selenium so dynamic content is captured.

        Uses the spider-owned browser (``spider.bro``) to load the URL,
        waits briefly for JS to run, then wraps the rendered page source in
        a fresh HtmlResponse so downstream parsing sees the dynamic data.

        NOTE(review): the fixed 2-second sleep is a fragile wait strategy —
        selenium explicit waits (WebDriverWait) would be more reliable, but
        that change is left to the owner of the spider.
        """
        bro = spider.bro
        bro.get(response.url)
        sleep(2)
        page_text = bro.page_source  # includes the dynamically loaded data
        new_response = HtmlResponse(url=request.url, body=page_text,
                                    encoding='utf-8', request=request)
        return new_response

    def process_exception(self, request, exception, spider):
        # Deliberate no-op: exceptions fall through to other middlewares.
        pass