- Incremental crawler
    - Concept: monitor a website for data updates and crawl only the newly published data.
    - How is incrementality implemented?
        - Deduplication!
            - Movie site: the data to crawl is not all on one page!
                - Record the URL of every movie's detail page.
                - Each time the crawler runs, check the detail-page URLs about to be crawled against that record.
                - The detail-page URLs can be stored in a Python set or in a Redis set.
                - All of the crawled movie data can be stored in Redis.
            - Monitoring the data inside the page behind a single URL:
                - Data fingerprint: a unique identifier generated for a piece of data (a minimal sketch of both approaches follows this list).
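Both deduplication strategies rely on the same Redis primitive: `sadd` returns 1 only when the member is new. A minimal standalone sketch (assuming a local Redis on 127.0.0.1:6379; the key names match the spiders below, and the sample values are placeholders):

```python
import hashlib
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

def is_new_url(url):
    # sadd returns 1 when the URL was not in the set before (a new page to crawl),
    # 0 when it was already recorded (already crawled).
    return conn.sadd('movie_detail_urls', url) == 1

def is_new_record(author, content):
    # Data fingerprint: hash the record itself, so data that lives on an
    # already-seen page is still only processed once.
    fingerprint = hashlib.sha256((author + content).encode('utf-8')).hexdigest()
    return conn.sadd('data_id', fingerprint) == 1

# Placeholder values purely for illustration.
if is_new_url('http://www.4567kan.com/index.php/vod/detail/id/1.html'):
    print('new detail page, crawl it')
if is_new_record('some author', 'some joke text'):
    print('new record, store it')
```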
Case 1: the data is spread across multiple pages
Project directory:
movie.py:
```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zls_movie_Pro.items import ZlsMovieProItem


class MovieSpider(CrawlSpider):
    conn = Redis(host='127.0.0.1', port=6379)
    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.4567kan.com/index.php/vod/show/id/5.html']

    rules = (
        Rule(LinkExtractor(allow=r'/index.php/vod/show/id/5/page/\d+\.html'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Movie name and detail-page URL
        li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li')
        for li in li_list:
            name = li.xpath('.//div[@class="stui-vodlist__detail"]/h4/a/text()').extract_first()
            item = ZlsMovieProItem()
            item['name'] = name
            detail_url = 'http://www.4567kan.com' + li.xpath(
                './/div[@class="stui-vodlist__detail"]/h4/a/@href').extract_first()
            # ex == 1: the URL was newly inserted; ex == 0: the URL is a duplicate.
            # The Redis set therefore acts as the dedup record.
            ex = self.conn.sadd('movie_detail_urls', detail_url)
            if ex == 1:
                print('New data detected, crawling......')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                     meta={'item': item})
            else:
                print('No new data!')

    def parse_detail(self, response):
        movie_desc = response.xpath(
            '/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item = response.meta['item']
        item['desc'] = movie_desc
        yield item
```
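For the pipeline below to receive the yielded items it has to be registered in the project's settings.py. A minimal sketch, assuming the project module is named zls_movie_Pro (matching the import in the spider); the dotted path may differ in your layout:

```python
# settings.py (excerpt)
ITEM_PIPELINES = {
    'zls_movie_Pro.pipelines.ZlsMovieProPipeline': 300,
}
```

The spider is then started with `scrapy crawl movie`; the Redis connection defined as a class attribute is shared with the pipeline via `spider.conn`.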
items.py:
```python
import scrapy


class ZlsMovieProItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    desc = scrapy.Field()
    # pass
```
pipelines.py (stores the crawled data in Redis):
```python
import json


class ZlsMovieProPipeline(object):
    def process_item(self, item, spider):
        # Reuse the Redis connection created on the spider.
        conn = spider.conn
        # Newer redis-py versions cannot store an Item/dict directly,
        # so serialize the item to JSON before pushing it.
        conn.lpush('movie_data', json.dumps(dict(item), ensure_ascii=False))
        return item
```
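To verify what has been stored, you can read the list and the dedup set back from Redis; a small sketch using the key names from the code above:

```python
import json
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

# Number of detail-page URLs recorded so far (the dedup set).
print(conn.scard('movie_detail_urls'))

# The stored movie items, newest first because lpush prepends.
for raw in conn.lrange('movie_data', 0, -1):
    print(json.loads(raw))
```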
Case 2: the data is all on the same page
- Requirement: crawl the jokes and their authors from Qiushibaike (糗事百科).
Spider file:
```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incrementByDataPro.items import IncrementbydataproItem
from redis import Redis
import hashlib


class QiubaiSpider(CrawlSpider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/text/$'), callback='parse_item', follow=True),
    )

    # Create the Redis connection object
    conn = Redis(host='127.0.0.1', port=6379)

    def parse_item(self, response):
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            item = IncrementbydataproItem()
            item['author'] = div.xpath(
                './div[1]/a[2]/h2/text() | ./div[1]/span[2]/h2/text()').extract_first()
            item['content'] = div.xpath('.//div[@class="content"]/span/text()').extract_first()

            # Build a unique fingerprint from the parsed data
            source = item['author'] + item['content']
            source_id = hashlib.sha256(source.encode()).hexdigest()
            # Store the fingerprint in the Redis set data_id:
            # 1 means it is new, 0 means this record was already crawled.
            ex = self.conn.sadd('data_id', source_id)
            if ex == 1:
                print('This record has not been crawled yet, crawling......')
                yield item
            else:
                print('This record was already crawled, skipping!!!')
```
Pipeline file:
```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

from redis import Redis


class IncrementbydataproPipeline(object):
    conn = None

    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        # print(dic)
        # Serialize to JSON: newer redis-py versions reject a raw dict.
        self.conn.lpush('qiubaiData', json.dumps(dic, ensure_ascii=False))
        return item
```
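If you ever need to re-crawl from scratch (for example after changing the parsing logic), the dedup records and stored data have to be removed first, otherwise every record is treated as already seen. A sketch assuming the key names used in the two examples above:

```python
from redis import Redis

conn = Redis(host='127.0.0.1', port=6379)

# Clearing these keys resets the incremental state for both projects.
conn.delete('movie_detail_urls', 'movie_data')   # movie project
conn.delete('data_id', 'qiubaiData')             # qiubai project
```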