-
案例分析:
- 需求:爬取网易新闻的国内板块下的新闻数据
- 需求分析:当点击“国内”超链接进入对应的页面时,会发现当前页面展示的新闻数据是动态加载出来的;如果直接通过程序对url进行请求,是获取不到这些动态加载的新闻数据的。因此需要使用selenium实例化一个浏览器对象,在该对象中进行url的请求,从而获取动态加载的新闻数据。
selenium在scrapy中使用的原理分析:
- 当引擎将国内板块url对应的请求提交给下载器后,下载器进行网页数据的下载,然后将下载到的页面数据封装到response中,提交给引擎,引擎再将response转交给Spiders。Spiders接收到的response对象中存储的页面数据里是没有动态加载的新闻数据的。要想获取动态加载的新闻数据,则需要在下载中间件中对下载器提交给引擎的response响应对象进行拦截,并对其内部存储的页面数据进行篡改,修改成携带了动态加载出的新闻数据的页面数据,然后将被篡改的response对象最终交给Spiders进行解析操作。
selenium在scrapy中的使用流程:
- 重写爬虫文件的构造方法,在该方法中使用selenium实例化一个浏览器对象(因为浏览器对象只需要被实例化一次)
- 重写爬虫文件的closed(self,spider)方法,在其内部关闭浏览器对象。该方法是在爬虫结束时被调用
- 重写下载中间件的process_response方法,让该方法对响应对象进行拦截,并篡改response中存储的页面数据
- 在配置文件中开启下载中间件
代码实现:
spider
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from wangyiPro.items import WangyiproItem


class WangyiSpider(scrapy.Spider):
    """Crawl news titles and article bodies from the NetEase domestic and world sections.

    The section listing pages load their news entries dynamically, so the
    spider owns a single Selenium Chrome instance (``self.browser``) that a
    downloader middleware can use to render those pages.
    """

    name = 'wangyi'
    # allowed_domains = ['www.163.com']
    start_urls = ['https://news.163.com/domestic/', 'https://news.163.com/world/']

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and start the shared Chrome browser.

        ``*args``/``**kwargs`` are forwarded to ``scrapy.Spider`` so that
        spider arguments (e.g. ``scrapy crawl wangyi -a key=value``) keep
        working.
        """
        super(WangyiSpider, self).__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument('--window-position=0,0')   # initial window position
        options.add_argument('--window-size=1080,800')  # initial window size
        # `options=` replaces the deprecated `chrome_options=` keyword.
        self.browser = webdriver.Chrome(
            executable_path='C://xx//chromedriver.exe',
            options=options,
        )

    def parse(self, response):
        """Yield one detail-page request per news entry on a section page.

        Each request carries a partially filled ``WangyiproItem`` (title
        only) in ``meta['item']``; ``parse_detail`` completes it.
        """
        for div_item in response.xpath('//div[@class="ndi_main"]/div'):
            title = div_item.xpath('./div/div[1]/h3/a/text()').extract_first()
            new_detail_url = div_item.xpath('./div/div[1]/h3/a/@href').extract_first()
            if not new_detail_url:
                # Entries without a headline link cannot be followed;
                # building a Request with url=None would raise.
                continue

            item = WangyiproItem()
            item['title'] = title
            # Request the article page and hand the item over via meta.
            yield scrapy.Request(
                url=new_detail_url,
                callback=self.parse_detail,
                meta={'item': item},
            )

    def parse_detail(self, response):
        """Extract the article text and yield the completed item."""
        text_nodes = response.xpath('//*[@id="endText"]//text()').extract()
        item = response.meta['item']
        item['content'] = ''.join(text_nodes).strip()
        yield item

    def closed(self, spider):
        """Quit the browser when the crawl ends.

        Scrapy calls this hook with the close reason as its only argument;
        the parameter name is kept unchanged for backward compatibility.
        """
        self.browser.quit()
middleware
from scrapy import signals
from time import sleep
from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware(object):
    """Downloader middleware that swaps section-page responses for Selenium-rendered HTML.

    Not all methods need to be defined. If a method is not defined, Scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Let every request continue through the downloader unchanged.

        Returning ``None`` tells Scrapy to keep processing the request.
        """
        return None

    def process_response(self, request, response, spider):
        """Replace responses for the spider's start URLs with browser-rendered pages.

        For a request whose URL is listed in ``spider.start_urls`` the page is
        loaded in the spider's Selenium browser, scrolled to the bottom, and
        the resulting page source is returned as a new ``HtmlResponse``.
        Every other response is passed through untouched.
        """
        # The middleware is enabled for the whole project, so tolerate
        # spiders that do not define a `browser` attribute.
        bro = getattr(spider, 'browser', None)
        if bro is None or request.url not in spider.start_urls:
            return response

        bro.get(request.url)
        sleep(3)  # give the page time to load its dynamic content
        bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        sleep(1)  # wait for content triggered by the scroll
        page_text = bro.page_source  # HTML including the dynamically loaded data
        return HtmlResponse(
            url=request.url,
            body=page_text,
            encoding="utf-8",
            request=request,
        )

    def process_exception(self, request, exception, spider):
        """Leave download exceptions to the rest of the middleware chain.

        Implicitly returns ``None``, which means "continue processing this
        exception".
        """
        pass

    def spider_opened(self, spider):
        """Log that the spider has been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
pipeline文件
import pymysql


class WangyiproPipeline(object):
    """Item pipeline that stores each scraped article in a MySQL table."""

    def __init__(self):
        self.conn = None    # database connection, opened in open_spider
        self.cursor = None  # cursor, created lazily on the first item
        self.num = 0

    def open_spider(self, spider):
        """Open the database connection; runs once when the spider starts."""
        self.conn = pymysql.Connect(host='192.168.xx.xx', port=3306, user='root',
                                    password='xx', db='xx_db', charset='utf8')
        print('爬虫数据库开始')

    def process_item(self, item, spider):
        """Insert one item and return it for any later pipelines.

        This method is called once per item, which is why the connection is
        opened and closed in ``open_spider`` / ``close_spider`` instead.
        A failed insert is reported and rolled back without stopping the
        crawl.
        """
        title = item['title']
        content = item['content']
        if self.cursor is None:
            # Reuse a single cursor instead of leaking a new one per item.
            self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into qiubai values(%s,%s)', (title, content))
            self.conn.commit()
        except Exception as e:
            # Slice (not tuple-index) the content for a short preview; the
            # original `content[0,20]` raised TypeError inside this handler
            # and skipped the rollback.
            print(e, content[0:20])
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and connection; runs once when the spider ends."""
        print('爬虫数据库结束')
        # Either may still be None if no item was processed or the
        # connection was never opened.
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()
items文件
import scrapy


class WangyiproItem(scrapy.Item):
    """Container for one scraped news article."""

    title = scrapy.Field()    # headline text
    content = scrapy.Field()  # article body text
setting配置
# Disguise the crawler as a regular desktop browser.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False  # ignore the robots.txt protocol

# Only show log messages of the given level.
LOG_LEVEL='ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wangyiPro.middlewares.WangyiproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}