Skip data that has already been crawled:
1. Deduplicate by URL
2. Deduplicate by data fingerprint
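Both approaches rely on the same Redis primitive: sadd adds a member to a set and returns 1 if the member is new, 0 if it was already there. A minimal sketch of the idea (assuming a local Redis listening on port 6388, the port used throughout this post):

from redis import Redis

conn = Redis(host='127.0.0.1', port=6388)
print(conn.sadd('movies_url', 'https://example.com/a'))  # 1 -> first time this member is seen
print(conn.sadd('movies_url', 'https://example.com/a'))  # 0 -> already in the set, skip it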
Create the scrapy project: scrapy startproject xxx
cd xxx
Create the spider file: scrapy genspider -t crawl spidername www.xxx.com
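For example, taking the project and spider names from the first example below, the concrete commands would be:

scrapy startproject increment1
cd increment1
scrapy genspider -t crawl first www.4567tv.tv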
1. Deduplicate by URL
Spider file
# -*- coding: utf-8 -*-
import scrapy
from redis import Redis
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from increment1.items import Increment1Item
"""
Skip data that has already been crawled:
1. Deduplicate by URL
2. Deduplicate by data fingerprint
"""


class FirstSpider(CrawlSpider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']

    rules = (
        Rule(LinkExtractor(allow=r'/index.php/vod/show/id/7/page/\d+.html'), callback='parse_item', follow=True),
    )

    def parse_detail(self, response):
        item = response.meta['item']
        actor = response.xpath('//div[@class="stui-content__detail"]/p[3]//text()').extract_first()
        item['actor'] = actor
        yield item

    def parse_item(self, response):
        conn = Redis(host='127.0.0.1', port=6388)
        # extract() returns all matching hrefs; build the full detail-page URLs
        detail_url_list = [
            'https://www.4567tv.tv' + href
            for href in response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]/div/a/@href').extract()
        ]
        for url in detail_url_list:
            item = Increment1Item()
            ex = conn.sadd('movies_url', url)
            if ex == 1:  # the URL is not in Redis yet, so crawl it
                yield scrapy.Request(url=url, callback=self.parse_detail, meta={'item': item})
            else:
                print('Already crawled, skipping!!!')
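The items.py for this project is not shown in the original; based on the single field the spider sets, it would look roughly like this:

import scrapy


class Increment1Item(scrapy.Item):
    actor = scrapy.Field()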
Store the data in the pipeline file
import json

from redis import Redis


class Increment1Pipeline(object):
    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6388)

    def process_item(self, item, spider):
        print('Writing new data')
        dic = {
            'actor': item['actor']
        }
        # redis-py cannot lpush a dict directly, so serialize it first
        self.conn.lpush('move_data', json.dumps(dic))
        return item
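For the pipeline to actually receive items, it has to be enabled in settings.py (300 is just the conventional priority value):

ITEM_PIPELINES = {
    'increment1.pipelines.Increment1Pipeline': 300,
}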
2. Deduplicate by data fingerprint
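When records have no stable URL, or the same URL can serve updated content, hash the record itself: identical data always produces the same digest, so the digest can be the Redis set member instead of the URL. A minimal sketch of the idea (the sample strings are made up):

import hashlib

record = 'author-1' + 'the text of the post'
fp_first = hashlib.sha256(record.encode()).hexdigest()
fp_again = hashlib.sha256(record.encode()).hexdigest()
assert fp_first == fp_again  # same data -> same fingerprint, so sadd would return 0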
Spider file
# -*- coding: utf-8 -*-
import hashlib

import scrapy
from redis import Redis
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incerment2.items import Incerment2Item


class FirstSpider(CrawlSpider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        self.conn = Redis(host='127.0.0.1', port=6388)

        div_list = response.xpath('//div[@class="content-left"]/div')
        for div in div_list:
            item = Incerment2Item()
            item['author'] = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a[1]/div/span/text()').extract()
            content = ''.join(content)
            item['content'] = content

            # Hash the crawled record into a unique identifier (the data fingerprint)
            source = item['author'] + item['content']
            hashValue = hashlib.sha256(source.encode()).hexdigest()

            # sadd returns 1 only when this fingerprint has not been seen before
            ex = self.conn.sadd('hashValue', hashValue)
            if ex == 1:
                yield item
            else:
                print('Data not updated, skipping')
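As in the first example, items.py is not shown in the original; given the two fields the spider sets, it would be roughly:

import scrapy


class Incerment2Item(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()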