scrapy + selenium:
Workflow:
1. Override the spider's constructor and instantiate a selenium browser object in it (the browser object only needs to be instantiated once).
2. Override the spider's closed(self, spider) method and close the browser object inside it; this method is called when the crawl finishes.
3. Override the downloader middleware's process_response method so that it intercepts the response object and tampers with the page data stored in the response.
4. Enable the downloader middleware in the settings file.
Test:
Spider file:
from scrapy_redis.spiders import RedisSpider
from selenium import webdriver


class WangyiSpider(RedisSpider):
    name = 'wangyi'
    # allowed_domains = ['www.xxxx.com']
    start_urls = ['https://news.163.com']

    def __init__(self):
        # Instantiate a browser object (instantiated only once)
        self.bro = webdriver.Chrome(executable_path='/Users/bobo/Desktop/chromedriver')

    # The browser must only be closed after the whole crawl has finished
    def closed(self, spider):
        print('Spider finished')
        self.bro.quit()
Middleware file:
import time

from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware(object):
    # Parameter overview:
    # this method intercepts the response object (the response the downloader passes to the Spider)
    # request: the request object that the response corresponds to
    # response: the intercepted response object
    # spider: the instance of the spider class defined in the spider file
    def process_response(self, request, response, spider):
        # Tamper with the page data stored in the response object
        if request.url in ['http://news.163.com/domestic/', 'http://news.163.com/world/', 'http://news.163.com/air/', 'http://war.163.com/']:
            spider.bro.get(url=request.url)
            js = 'window.scrollTo(0, document.body.scrollHeight)'
            spider.bro.execute_script(js)
            time.sleep(2)  # the browser must be given some time to load the data
            # The page source now contains the dynamically loaded news data
            page_text = spider.bro.page_source
            # Tamper with the response object
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
        else:
            return response

Settings file:
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}
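The spider in this first example does not show its parse callback. Below is a minimal sketch of how the tampered, browser-rendered response might be consumed inside the WangyiSpider class above; the XPath and the dict key are illustrative assumptions, not taken from the original notes:

    # Hypothetical parse callback (assumed selector, for illustration only)
    def parse(self, response):
        # response is the HtmlResponse the middleware built from the
        # browser-rendered page source, so dynamically loaded news is present
        for title in response.xpath('//h3/a/text()').extract():
            yield {'news_title': title}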
Practice 2:
settings.py:
BOT_NAME = 'wangyipro'

SPIDER_MODULES = ['wangyipro.spiders']
NEWSPIDER_MODULE = 'wangyipro.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wangyipro.middlewares.WangyiproDownloaderMiddleware': 543,
}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wangyipro.pipelines.WangyiproPipeline': 300,
}

LOG_LEVEL = 'ERROR'
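ITEM_PIPELINES above references WangyiproPipeline, whose code is not included in these notes. A minimal sketch, assuming the pipeline simply writes each item to a local text file (the file name and output format are illustrative assumptions):

# pipelines.py (sketch, not part of the original notes)
class WangyiproPipeline(object):
    def open_spider(self, spider):
        # news.txt is an assumed output file, used only for illustration
        self.fp = open('news.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # news_title and news_content are the two fields the spider fills in
        self.fp.write(item['news_title'] + '\n' + item['news_content'] + '\n\n')
        return item

    def close_spider(self, spider):
        self.fp.close()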
Middleware file:
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import time

from scrapy import signals
from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        return None

    # This method intercepts every response object (the requirement is to intercept only some of them)
    def process_response(self, request, response, spider):
        """
        Pick out the specified response objects for parsing.
        A specific response object can be located via its request object,
        and the request object can be located via the request's URL.
        The target URLs are stored in spider.model_urls.
        :param request:
        :param response:
        :param spider:
        :return:
        """
        bro = spider.bro
        if request.url in spider.model_urls:
            # The specified URL locates the specified request, and the request locates
            # the specified response (the old response that does not meet the requirement).
            # Manually create four new response objects that do meet the requirement
            # (the qualifying response data must be placed into them) and use them
            # to replace the original responses.
            bro.get(request.url)  # Use the browser to send a request to the corresponding URL
            time.sleep(2)
            js = "window.scrollTo(0, document.body.scrollHeight)"
            bro.execute_script(js)
            time.sleep(2)
            page_text = bro.page_source  # The page source now contains the dynamically loaded page data
            # Manually create a new response object and wrap page_text into it as
            # the response data; the body parameter carries the response data
            return HtmlResponse(url=bro.current_url, body=page_text, encoding='utf-8', request=request)
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
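The spider below imports WangyiproItem from wangyipro.items, but items.py is not included in these notes. A minimal sketch, with the fields inferred from the two assignments the spider makes (news_title and news_content):

# items.py (sketch; fields inferred from the spider code below)
import scrapy


class WangyiproItem(scrapy.Item):
    news_title = scrapy.Field()    # set in parse_detail
    news_content = scrapy.Field()  # set in news_content_parse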
Spider file:
# -*- coding: utf-8 -*-
import scrapy
from wangyipro.items import WangyiproItem
from selenium import webdriver


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    model_urls = []  # Holds the detail-page URLs of the four sections

    def __init__(self):
        self.bro = webdriver.Chrome(executable_path=r"C:\Users\Administrator\Desktop\爬虫\day06-爬虫\chromedriver.exe")
    def news_content_parse(self, response):
        """
        Parses the content of a news detail page; the item object passed
        along with the request is picked up here.
        :param response:
        :return:
        """
        item = response.meta['item']
        # extract() returns a list whose elements are strings
        content_list = response.xpath('//div[@id="endText"]//text()').extract()
        item['news_content'] = "".join(content_list)
        yield item

    def parse_detail(self, response):
        """
        Parses the news data on this section's page; here we only scrape the news titles.
        :param response:
        :return:
        """
        div_list = response.xpath('//div[@class="data_row news_article clearfix "]')
        for div in div_list:
            # At this point only the news title has been parsed, not yet the news content
            item = WangyiproItem()
            news_title = div.xpath('./div/div[1]//a/text()').extract_first()
            news_detail_url = div.xpath('./div/div[1]//a/@href').extract_first()
            item['news_title'] = news_title
            # Send a request to fetch the news content; pass the item to the
            # next parse method via the request's meta
            yield scrapy.Request(url=news_detail_url, callback=self.news_content_parse, meta={'item': item})

    def parse(self, response):
        """
        Parses the URLs of the four sections.
        extract can only be called on an XPath selector when taking text or an attribute.
        :param response:
        :return:
        """
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li')
        indexs = [3, 4, 6, 7]
        model_li_list = []  # This list holds the li tags of the four sections
        for index in indexs:
            li = li_list[index]
            model_li_list.append(li)
        print(model_li_list)
        # Parse out the URLs of the four sections
        for li in model_li_list:
            model_url = li.xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
            # Then send a request to each section's URL and get the page source data of the detail pages
            yield scrapy.Request(url=model_url, callback=self.parse_detail)
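Unlike the first example, the spider above does not override the closed(self, spider) hook described in step 2 of the workflow, so the browser started in __init__ is never quit. A minimal addition inside the WangyiSpider class, mirroring the first example:

    # Called once when the crawl finishes; closes the browser opened in __init__
    def closed(self, spider):
        self.bro.quit()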