Django:
    # Create a project
    django-admin startproject mysite
    cd mysite
    # Create apps
    python manage.py startapp app01
    python manage.py startapp app02
    # Run the project
    python manage.py runserver

scrapy:
    # Create a project:  scrapy startproject <project-name>
    scrapy startproject xdb
    cd xdb
    # Create a spider:   scrapy genspider <spider-name> <domain>
    scrapy genspider chouti chouti.com
    scrapy genspider cnblogs cnblogs.com
    # Run a spider
    scrapy crawl chouti
    scrapy crawl chouti --nolog
""" 源码内容: 1. 判断当前XdbPipeline类中是否有from_crawler 有:obj = XdbPipeline.from_crawler(...) 否:obj = XdbPipeline() 2. obj.open_spider() 3. obj.process_item()|obj.process_item()|obj.process_item()| 4. obj.close_spider() """ from scrapy.exceptions import DropItem class XdbPipeline(object): def __init__(self, path): self.f = None self.path = path @classmethod def from_crawler(cls, crawler): ''' 初始化时候,用于创建pipeline对象 :param crawler: :return: ''' path = crawler.settings.get('HREF_FILE_PATH') return cls(path) def open_spider(self, spider): ''' 爬虫开始执行时,调用 :param spider: :return: ''' self.f = open(self.path, 'a+') def process_item(self, item, spider): # print(item.get("text")) self.f.write(item.get('href') + ' ') return item # 交给下一个Pipleline中的process_item方法去执行
return DropItem() # 后续的Pipeline中的process_item方法不再执行 def close_spider(self, spider): ''' 爬虫关闭时,被调用 :param spider: :return: ''' self.f.close()
# Persistence via pipelines — the full example spread across project files.

# ---- pipelines.py ----
class XdbPipeline(object):
    """Append each scraped item's href to the file named by HREF_FILE_PATH."""

    def __init__(self, path):
        self.f = None      # file handle; opened in open_spider
        self.path = path   # output file path, taken from settings

    @classmethod
    def from_crawler(cls, crawler):
        # Called once at startup; builds the pipeline from project settings.
        path = crawler.settings.get('HREF_FILE_PATH')
        return cls(path)

    def open_spider(self, spider):
        self.f = open(self.path, 'a+')

    def process_item(self, item, spider):
        # print(item.get("text"))
        self.f.write(item.get('href') + ' ')
        return item

    def close_spider(self, spider):
        self.f.close()

# ---- settings.py ----
ITEM_PIPELINES = {
    # Lower number = higher priority; valid range is 0-1000.
    'xdb.pipelines.XdbPipeline': 300,
}

# ---- items.py ----
import scrapy

class XdbItem(scrapy.Item):
    text = scrapy.Field()
    href = scrapy.Field()

# ---- chouti.py ----
import scrapy
from xdb.items import XdbItem  # fixed: original note was missing `from`

class ChoutiSpider(scrapy.Spider):
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # Each link-detail div holds one anchor; pull its text and href
        # and emit one XdbItem per link for the pipeline to persist.
        content_list = response.xpath('//div[@class="link-con"]//div[@class="link-detail"]')
        for item in content_list:
            text = item.xpath('./a/text()').extract_first()
            href = item.xpath('./a/@href').extract_first()
            yield XdbItem(text=text, href=href)