  • Scrapy persistence: pipelines/items

    Problems left open by the previous post:

      https://www.cnblogs.com/Alexephor/p/11432195.html  

      - could not open a connection when the spider starts and close it when the spider shuts down
      - responsibilities were not clearly separated

    This post fixes both problems.

      The main weakness of the previous post was that the spider's parse method opened and closed the file on the fly while scraping, and the spider logic did not separate responsibilities clearly.

      How to fix it: 1. Open the file (or the database connection) once when the spider starts, and close it once when the spider finishes.

           2. Hand each scraped item to process_item for processing.

    Chouti.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

from wyb.items import WybItem


class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    # Only crawl pages under this domain
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # response is an HtmlResponse (scrapy.http.response.html.HtmlResponse)
        # print(response, type(response))
        # print(response.text)
        item_list = response.xpath('//div[@id="content-list"]/div[@class="item"]')
        for item in item_list:
            text = item.xpath('.//a/text()').extract_first()
            href = item.xpath('.//a/@href').extract_first()
            # Each yielded item is handed to the pipelines' process_item
            yield WybItem(text=text, href=href)
        page_list = response.xpath('//div[@id="dig_lcpage"]//a/@href').extract()
        for page in page_list:
            page = "https://dig.chouti.com" + page
            # Keep crawling: send another request with parse as the callback
            yield Request(url=page, callback=self.parse)

    pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.exceptions import DropItem

from .settings import HREF_FILE_PATH  # only needed by the commented-out alternative below


class WybPipeline(object):
    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called once at startup to create the pipeline object.
        :param crawler:
        :return:
        """
        path = crawler.settings.get('HREF_FILE_PATH')
        return cls(path)

    def open_spider(self, spider):
        """
        Called when the spider starts.
        :param spider:
        :return:
        """
        # if spider.name == "chouti":
        # self.f = open(HREF_FILE_PATH, 'a+')
        self.f = open(self.path, 'a+')

    def process_item(self, item, spider):
        # item is what the spider yields
        # spider is the current ChoutiSpider instance
        # f = open('news.log', 'a+')
        # f.write(item['href'])
        # f.close()
        self.f.write(item['href'] + '\n')
        # return item   # would hand the item to the next pipeline's process_item
        raise DropItem()  # later pipelines' process_item will not run for this item

    def close_spider(self, spider):
        """
        Called when the spider closes.
        :param spider:
        :return:
        """
        self.f.close()


class DBPipeline(object):
    def __init__(self, path):
        self.f = None
        self.path = path

    @classmethod
    def from_crawler(cls, crawler):
        """
        Called once at startup to create the pipeline object.
        :param crawler:
        :return:
        """
        path = crawler.settings.get('HREF_DB_PATH')
        return cls(path)

    def open_spider(self, spider):
        """
        Called when the spider starts.
        :param spider:
        :return:
        """
        # self.f = open(HREF_DB_PATH, 'a+')
        self.f = open(self.path, 'a+')

    def process_item(self, item, spider):
        # item is what the spider yields
        # spider is the current ChoutiSpider instance
        # f = open('db.log', 'a+')
        # f.write(item['href'])
        # f.close()
        self.f.write(item['href'] + '\n')
        return item

    def close_spider(self, spider):
        """
        Called when the spider closes.
        :param spider:
        :return:
        """
        self.f.close()
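
    Note that because WybPipeline (priority 300) raises DropItem, DBPipeline (priority 301) never receives any items, so db.log stays empty with this configuration. If both files should be written, return the item from WybPipeline and raise DropItem only for items you actually want to discard. A minimal sketch of that variant (the missing-href check is a hypothetical condition, not part of the original code):

# Sketch: pass items through and drop selectively; the missing-href check is hypothetical
from scrapy.exceptions import DropItem


class SelectiveDropPipeline(object):
    def open_spider(self, spider):
        self.f = open('news.log', 'a+')

    def process_item(self, item, spider):
        if not item.get('href'):
            raise DropItem('item has no href')  # later pipelines never see this item
        self.f.write(item['href'] + '\n')
        return item  # DBPipeline (priority 301) gets the item next

    def close_spider(self, spider):
        self.f.close()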

     items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WybItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    href = scrapy.Field()
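
    A scrapy.Item behaves like a dict, which is why process_item can read item['href']. A quick illustration (the values below are made up):

# Illustration only: items take keyword fields and read like a dict
from wyb.items import WybItem

item = WybItem(text='some headline', href='https://dig.chouti.com/link/123')
print(item['href'])      # 'https://dig.chouti.com/link/123'
print(item.get('text'))  # 'some headline'
print(dict(item))        # {'text': 'some headline', 'href': '...'}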

    settings.py

ITEM_PIPELINES = {
   'wyb.pipelines.WybPipeline': 300,
   'wyb.pipelines.DBPipeline': 301,
   # priority range 0-1000; the lower the number, the earlier the pipeline runs
}


# output file paths
HREF_FILE_PATH = 'news.log'
HREF_DB_PATH = 'db.log'
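
    With the pipelines registered and the paths configured, the spider can be started with the usual scrapy crawl chouti command, or from a small script. A sketch of the script approach (assuming it is run from the project root and that the spider module lives at wyb/spiders/chouti.py):

# Sketch: run the spider from a script; assumes the standard project layout
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from wyb.spiders.chouti import ChoutiSpider

process = CrawlerProcess(get_project_settings())  # loads settings.py, including ITEM_PIPELINES
process.crawl(ChoutiSpider)
process.start()  # blocks until the crawl finishes; close_spider closes the log files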

    Summary:

- Persistence with pipelines/items
    a. Write the pipeline class first
        class WybPipeline(object):
            def process_item(self, item, spider):
                print(item)
                # item is what the spider yields
                # spider is the current ChoutiSpider instance
                return item
    b. Write the Item class
        class WybItem(scrapy.Item):
            # define the fields for your item here like:
            # name = scrapy.Field()
            text = scrapy.Field()
            href = scrapy.Field()
    c. Configure it in settings.py
        ITEM_PIPELINES = {
            'wyb.pipelines.WybPipeline': 300,
            # priority range 0-1000; lower number = higher priority
        }
    d. In the spider, each yield of an item triggers one call to process_item
        yield WybItem(text=text, href=href)

    A closer look at the pipeline file

      The flow followed by the Scrapy source:

    1. Check whether WybPipeline defines from_crawler
           if it does:
               obj = WybPipeline.from_crawler(crawler)
           if it does not:
               obj = WybPipeline()
    2. obj.open_spider(spider)
    3. obj.process_item(item, spider), called once for every yielded item
    4. obj.close_spider(spider)
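
    In other words, the pipeline machinery behaves roughly like the sketch below (a simplified illustration, not the actual Scrapy source):

# Simplified illustration of the flow above (not the real Scrapy implementation)
def build_pipeline(pipeline_cls, crawler):
    # 1. Prefer from_crawler if the class defines it
    if hasattr(pipeline_cls, 'from_crawler'):
        return pipeline_cls.from_crawler(crawler)
    return pipeline_cls()


def run_pipeline(pipeline, spider, items):
    pipeline.open_spider(spider)             # 2. once, when the spider starts
    for item in items:
        pipeline.process_item(item, spider)  # 3. once per yielded item
    pipeline.close_spider(spider)            # 4. once, when the spider closes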

    From this we can see that when defining a Pipeline class:

      there are five methods to implement

__init__(self, path)

@classmethod
from_crawler(cls, crawler)

open_spider(self, spider)

process_item(self, item, spider)

close_spider(self, spider)

     

  • Original post: https://www.cnblogs.com/Alexephor/p/11436726.html