Skip data that has already been crawled:
1. Deduplicate by URL
2. Deduplicate by data fingerprint
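Both approaches rely on the same Redis primitive: sadd adds a member to a set and returns 1 if the member is new, 0 if it was already there. A minimal sketch of the idea (assuming a local Redis listening on port 6388, the port used throughout this post):

from redis import Redis

conn = Redis(host='127.0.0.1', port=6388)
print(conn.sadd('movies_url', 'https://example.com/a'))  # 1 -> first time this member is seen
print(conn.sadd('movies_url', 'https://example.com/a'))  # 0 -> already in the set, skip it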
Create the scrapy project: scrapy startproject xxx
cd xxx
Create the spider file: scrapy genspider -t crawl spidername www.xxx.com
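For example, taking the project and spider names from the first example below, the concrete commands would be:

scrapy startproject increment1
cd increment1
scrapy genspider -t crawl first www.4567tv.tv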
1. Deduplicate by URL
Spider file
# -*- coding: utf-8 -*-
import scrapy
from redis import Redis
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from increment1.items import Increment1Item
"""
Skip data that has already been crawled:
1. Deduplicate by URL
2. Deduplicate by data fingerprint
"""


class FirstSpider(CrawlSpider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']

    rules = (
        Rule(LinkExtractor(allow=r'/index.php/vod/show/id/7/page/\d+.html'), callback='parse_item', follow=True),
    )

    def parse_detail(self, response):
        item = response.meta['item']
        actor = response.xpath('//div[@class="stui-content__detail"]/p[3]//text()').extract_first()
        item['actor'] = actor
        yield item

    def parse_item(self, response):
        conn = Redis(host='127.0.0.1', port=6388)
        # extract() returns all matching hrefs; build the full detail-page URLs
        detail_url_list = [
            'https://www.4567tv.tv' + href
            for href in response.xpath('//li[@class="col-md-6 col-sm-4 col-xs-3"]/div/a/@href').extract()
        ]
        for url in detail_url_list:
            item = Increment1Item()
            ex = conn.sadd('movies_url', url)
            if ex == 1:  # the URL is not in Redis yet, so crawl it
                yield scrapy.Request(url=url, callback=self.parse_detail, meta={'item': item})
            else:
                print('Already crawled, skipping!!!')
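The items.py for this project is not shown in the original; based on the single field the spider sets, it would look roughly like this:

import scrapy


class Increment1Item(scrapy.Item):
    actor = scrapy.Field()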
Store the data in the pipeline file
import json

from redis import Redis


class Increment1Pipeline(object):
    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6388)

    def process_item(self, item, spider):
        print('Writing new data')
        dic = {
            'actor': item['actor']
        }
        # redis-py cannot lpush a dict directly, so serialize it first
        self.conn.lpush('move_data', json.dumps(dic))
        return item
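For the pipeline to actually receive items, it has to be enabled in settings.py (300 is just the conventional priority value):

ITEM_PIPELINES = {
    'increment1.pipelines.Increment1Pipeline': 300,
}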
2. Deduplicate by data fingerprint
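When records have no stable URL, or the same URL can serve updated content, hash the record itself: identical data always produces the same digest, so the digest can be the Redis set member instead of the URL. A minimal sketch of the idea (the sample strings are made up):

import hashlib

record = 'author-1' + 'the text of the post'
fp_first = hashlib.sha256(record.encode()).hexdigest()
fp_again = hashlib.sha256(record.encode()).hexdigest()
assert fp_first == fp_again  # same data -> same fingerprint, so sadd would return 0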
Spider file
# -*- coding: utf-8 -*-
import hashlib

import scrapy
from redis import Redis
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from incerment2.items import Incerment2Item


class FirstSpider(CrawlSpider):
    name = 'first'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    rules = (
        Rule(LinkExtractor(allow=r'/text/page/\d+/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        self.conn = Redis(host='127.0.0.1', port=6388)

        div_list = response.xpath('//div[@class="content-left"]/div')
        for div in div_list:
            item = Incerment2Item()
            item['author'] = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            content = div.xpath('./a[1]/div/span/text()').extract()
            content = ''.join(content)
            item['content'] = content

            # Hash the crawled record into a unique identifier (the data fingerprint)
            source = item['author'] + item['content']
            hashValue = hashlib.sha256(source.encode()).hexdigest()

            # sadd returns 1 only when this fingerprint has not been seen before
            ex = self.conn.sadd('hashValue', hashValue)
            if ex == 1:
                yield item
            else:
                print('Data not updated, skipping')
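As in the first example, items.py is not shown in the original; given the two fields the spider sets, it would be roughly:

import scrapy


class Incerment2Item(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()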