增量式
-
概念:检测网站数据更新的情况。爬取到最新更新出来的数据。
-
核心:去重
-
记录表:需要持久化存储。redis中set
- 记录爬取过的信息
- 爬取过的电影详情页的url:对应的是深度爬取
- 数据指纹:对应的非深度爬取(一张页面的数据更新)
- 数据指纹:一组数据的唯一标识
- 将爬取过的数据指纹存入记录表,用于判断该条数据是否已爬取过
-
代码实现:
# -*- coding: utf-8 -*- import scrapy from scrapy.linkextractors import LinkExtractor from scrapy.spiders import CrawlSpider, Rule from redis import Redis from moviePro.items import MovieproItem class MovieSpider(CrawlSpider): name = 'movie' # allowed_domains = ['www.xxx.com'] start_urls = ['https://www.4567tv.tv/frim/index1.html'] conn = Redis(host='127.0.0.1',port=6379) link = LinkExtractor(allow=r'frim/index1-d+.html')#提取页码链接 rules = ( Rule(link, callback='parse_item', follow=False), ) def parse_item(self, response): #电影名称+详情页的url li_list = response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li') for li in li_list: name = li.xpath('./div/a/@title').extract_first() item = MovieproItem() item['name'] = name detail_url = 'https://www.4567tv.tv'+li.xpath('./div/a/@href').extract_first() ex = self.conn.sadd('movie_record',detail_url) if ex == 1:#这部电影之前没有存在于记录表中 print('有最新更新的数据!!!!!!') yield scrapy.Request(url=detail_url,callback=self.parse_detail,meta={'item':item}) else: print('暂无新数据的更新......') def parse_detail(self,response): item = response.meta['item'] desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first() item['desc'] = desc yield item
pipelines.py文件的代码:
class MovieproPipeline(object): def process_item(self, item, spider): conn = spider.conn conn.lpush('movieData',item) return item