The site to scrape is: http://quotes.toscrape.com/
I stumbled my way through it, but got it written.
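As a side note, the crawl can be started with scrapy crawl quote from the project directory. The snippet below is only a sketch of a hypothetical run.py helper (placed next to scrapy.cfg) that launches the same command from an IDE via Scrapy's cmdline module; the spider name quote comes from the spider defined below.

# run.py - hypothetical helper, equivalent to running `scrapy crawl quote` in the project directory
from scrapy import cmdline

cmdline.execute('scrapy crawl quote'.split())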
spiders
import scrapy
from kkk.items import QuoteItem


class QuoteSpider(scrapy.Spider):
    name = 'quote'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            # A good pattern: when a page is built from repeated blocks, grab the list of
            # blocks first, then loop over it, pull the details out of each block, and yield.
            item = QuoteItem()
            text = quote.css('span.text::text').extract_first()
            author = quote.css('small.author::text').extract_first()
            tags = quote.css('div.tags > a.tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item

        url = response.css('li.next > a::attr(href)').extract_first()
        next_url = response.urljoin(url)  # response.urljoin() builds the absolute URL
        yield scrapy.Request(url=next_url, callback=self.parse)  # yield a Request for the next page
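The spider imports QuoteItem from kkk.items, which is not shown above. A minimal sketch of what that items.py could look like, assuming only the three fields the spider actually assigns (text, author, tags):

# kkk/items.py - a minimal sketch; the field names match what the spider assigns
import scrapy


class QuoteItem(scrapy.Item):
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()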
Pipeline
Takeaways:
1. These are all ordinary classes, so you are completely free to define __init__ and attach data to their attributes.
import pymongo
from scrapy.exceptions import DropItem


class QuotePipeline1(object):
    def __init__(self):
        self.limit = 40

    def process_item(self, item, spider):
        if item:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][:self.limit] + '...'
            return item
        else:
            raise DropItem('Dropped {}'.format(item))


class QuotePipeline2(object):
    def __init__(self, uri, port, db):
        self.uri = uri
        self.port = port
        self.db = db

    @classmethod
    def from_crawler(cls, crawler):
        # from_crawler(cls, crawler) is a classmethod that lets the pipeline read values from settings.
        mongo_uri = crawler.settings.get('MONGO_URI')
        mongo_port = crawler.settings.get('MONGO_PORT')
        mongo_db = crawler.settings.get('MONGO_DB')
        return cls(
            mongo_uri,
            mongo_port,
            mongo_db,
        )

    def process_item(self, item, spider):
        # __class__.__name__ is the name of the item's class, i.e. the QuoteItem class
        # defined in items.py; it is used here as the MongoDB collection name.
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.uri, self.port)
        self.db = self.client[self.db]

    def close_spider(self, spider):
        self.client.close()
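For from_crawler to find the MONGO_* values, the pipelines also have to be enabled and the settings defined in settings.py. Below is a sketch of the relevant part only; the module path kkk.pipelines, the priorities, and the concrete values are assumptions (a local MongoDB and an example database name).

# kkk/settings.py - pipeline-related part only; the values below are example assumptions
ITEM_PIPELINES = {
    'kkk.pipelines.QuotePipeline1': 300,  # runs first (lower number): truncate long text
    'kkk.pipelines.QuotePipeline2': 400,  # runs second: write the item to MongoDB
}

MONGO_URI = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'quotes'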