Downloader middleware
"""
# Two kinds of middleware: downloader middleware and spider middleware
# 1 Write them in middlewares.py (the class names can be anything)
# 2 Enable them in settings.py:
SPIDER_MIDDLEWARES = {
'cnblogs_crawl.middlewares.CnblogsCrawlSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
'cnblogs_crawl.middlewares.CnblogsCrawlDownloaderMiddleware': 543,
}
# Downloader middleware hooks
- process_request: (runs on the way out, as the request heads to the downloader)
    # - return None: keep processing this request and move on to the next middleware
    # - return a Response object: this request is done; the Response goes back to the engine (you can fetch the page yourself and wrap it in a Response)
    # - return a Request object: the Request is handed back to the engine to be rescheduled
    # - raise an exception: process_exception is called
- process_response: (runs on the way back, as the response returns)
    # - return a Response object: keep processing this Response through the remaining middlewares
    # - return a Request object: hand it back to the engine for rescheduling
    # - raise IgnoreRequest: process_exception is called
- process_exception: (runs when an exception is raised)
    # - return None: continue processing this exception
    # - return a Response object: stops the process_exception() chain; the Response goes to the engine (and on to the spider)
    # - return a Request object: stops the process_exception() chain; the Request goes back to the engine (rescheduled)
"""
class CnblogsCrawlDownloaderMiddleware(object):
"""
def process_request(self, request, spider):
    # 1 Add cookies (request.cookies is the cookie dict sent with this request)
    print(request.cookies)
    # Take a cookie from your cookie pool and swap it in here
    request.cookies = {'name': 'alen', 'age': 18}
    # Printing again shows the cookie has been modified
    print(request.cookies)
"""
"""
# Prerequisite: clone the proxy pool project from GitHub and run it locally first
class CnblogsCrawlDownloaderMiddleware(object):
    def get_proxy(self):
        import requests
        try:
            ret = requests.get('http://127.0.0.1:5010/get').json()['proxy']
        except Exception:
            ret = requests.get('https://127.0.0.1:5010/get').json()['proxy']
        print(ret)
        return ret

    def process_request(self, request, spider):
        # 2 Add a proxy
        request.meta['proxy'] = self.get_proxy()
        print(request.meta['proxy'])
"""
"""
# 3 Random User-Agent
from fake_useragent import UserAgent
ua = UserAgent(verify_ssl=False)

def process_request(self, request, spider):
    request.headers['User-Agent'] = ua.random
    print(request.headers)
"""
Selenium integration
"""
# Once the spider starts, open a single Chrome browser and reuse it for every request
# 1 Create the bro object in the spider
from selenium import webdriver
bro = webdriver.Chrome(executable_path='/Users/liuqingzheng/Desktop/crawl/cnblogs_crawl/cnblogs_crawl/chromedriver')
# 2 Use it in the middleware's process_request:
from scrapy.http import HtmlResponse
spider.bro.get(request.url)
text = spider.bro.page_source
response = HtmlResponse(url=request.url, status=200, body=text.encode('utf-8'))
return response
# 3 Close the browser in the spider
def close(self, reason):
    self.bro.close()
"""