zoukankan      html  css  js  c++  java
  • scrapy之小试身手

    要爬取的网址是:http://quotes.toscrape.com/

    磕磕绊绊的写完了

    spiders

    import scrapy
    from kkk.items import *
    
    class QuoteSpider(scrapy.Spider):
        """Spider that scrapes every quote listed on quotes.toscrape.com.

        Each quote block on a page becomes one ``QuoteItem`` with ``text``,
        ``author`` and ``tags`` fields; the spider then follows the "next"
        pagination link, if the page has one.
        """

        name = 'quote'
        # allowed_domains entries must be bare domain names (no scheme, path
        # or trailing slash), otherwise they cannot match a request's host.
        allowed_domains = ['quotes.toscrape.com']
        start_urls = ['http://quotes.toscrape.com/']

        def parse(self, response):
            """Yield one item per quote on the page, then request the next page.

            :param response: the downloaded listing page.
            :returns: a generator of ``QuoteItem`` objects, followed by at most
                one ``scrapy.Request`` for the next listing page.
            """
            # The page is built from repeated ".quote" blocks: select the list
            # of blocks once, then pull the details out of each block in turn.
            for quote in response.css('.quote'):
                item = QuoteItem()
                item['text'] = quote.css('span.text::text').extract_first()
                item['author'] = quote.css('small.author::text').extract_first()
                item['tags'] = quote.css('div.tags > a.tag::text').extract()
                yield item

            next_href = response.css('li.next > a::attr(href)').extract_first()
            # No "next" link was found (extract_first() returned None): stop
            # here, because urljoin(None) would just give back the current URL.
            if next_href is not None:
                # The href is relative; urljoin resolves it against response.url.
                next_url = response.urljoin(next_href)
                yield scrapy.Request(url=next_url, callback=self.parse)

      Pipeline

      收获:

       1 这里的 Pipeline 都是普通的类,完全可以定义 __init__,把需要的数据(例如长度上限)保存为实例属性,供 process_item 使用。

    import pymongo
    from scrapy.exceptions import DropItem
    
    class QuotePipeline1(object):
        """Item pipeline that shortens over-long quote text and drops empty items."""

        def __init__(self, limit=40):
            """Store the maximum number of characters kept from ``item['text']``.

            :param limit: maximum text length before truncation (default 40).
            """
            self.limit = limit

        def process_item(self, item, spider):
            """Truncate ``item['text']`` to ``self.limit`` characters.

            :param item: the scraped item; must provide a ``'text'`` key.
            :param spider: the spider that produced the item (unused).
            :returns: the item, with ``'text'`` cut to ``limit`` characters plus
                ``'...'`` when it was longer than ``limit``.
            :raises DropItem: if the item is empty (falsy).
            """
            if not item:
                # DropItem must be raised, not returned: a returned exception
                # object would be passed on as if it were the item.
                raise DropItem('{}扔掉'.format(item))

            text = item['text']
            # text is None when the selector matched nothing; leave it as is.
            if text is not None and len(text) > self.limit:
                item['text'] = text[:self.limit] + '...'
            return item
    
    class QuotePipeline2(object):
        """Item pipeline that stores every item in MongoDB.

        Items are written to a collection named after the item's class
        (for example ``QuoteItem``) inside the configured database.
        """

        def __init__(self, uri, port, db):
            """Remember the connection parameters; no connection is opened yet.

            :param uri: MongoDB host name or URI passed to ``MongoClient``.
            :param port: MongoDB port passed to ``MongoClient``.
            :param db: name of the database to write to.
            """
            self.uri = uri
            self.port = port
            # Keep the name separately so that open_spider() can be called
            # more than once; self.db becomes the database handle once opened.
            self.db_name = db
            self.db = db
            self.client = None

        @classmethod
        def from_crawler(cls, crawler):
            """Build the pipeline from the crawler's settings.

            Reads ``MONGO_URI``, ``MONGO_PORT`` and ``MONGO_DB`` from
            ``crawler.settings``.

            :param crawler: object exposing a ``settings`` mapping.
            :returns: a new pipeline instance.
            """
            settings = crawler.settings
            return cls(
                settings.get('MONGO_URI'),
                settings.get('MONGO_PORT'),
                settings.get('MONGO_DB'),
            )

        def open_spider(self, spider):
            """Open the MongoDB connection and select the target database."""
            self.client = pymongo.MongoClient(self.uri, self.port)
            self.db = self.client[self.db_name]

        def process_item(self, item, spider):
            """Insert the item into the collection named after its class.

            :param item: the scraped item (anything ``dict()`` can copy).
            :param spider: the spider that produced the item (unused).
            :returns: the unchanged item.
            """
            # __class__.__name__ is the name of the item's class, i.e. the
            # class declared in items.py such as QuoteItem.
            collection_name = item.__class__.__name__
            # insert_one replaces Collection.insert, which was deprecated in
            # PyMongo 3 and removed in PyMongo 4.
            self.db[collection_name].insert_one(dict(item))
            return item

        def close_spider(self, spider):
            """Close the MongoDB connection."""
            self.client.close()
  • 相关阅读:
    Memcached学习(一)
    了解下SoftReference
    Java Thread UncaughtExceptionHandler
    csdn 不登录浏览全文 chrome 浏览器
    postgresql 主从 patroni
    熔断,限流,降级
    CentOS7.4 源码安装MySQL8.0
    CentOS BIND9安装及配置
    Linux安全之SYN攻击原理及其应对措施
    fping常用参数介绍
  • 原文地址:https://www.cnblogs.com/654321cc/p/8878669.html
Copyright © 2011-2022 走看看