  • Scraping the novel Daomu Biji (盗墓笔记) with Scrapy

    The spider starts from the site index, follows each book link to its chapter list, builds an item per chapter, and carries that item to the detail callback via meta so the chapter text can be attached before it reaches the MongoDB pipeline.

    # -*- coding: utf-8 -*-
    import scrapy
    from daomu.items import DaomuItem
    
    class DaomuspiderSpider(scrapy.Spider):
        name = "daomuspider"
        # allowed_domains = ["www.daomubiji.com"]
        start_urls = ['http://www.daomubiji.com/']
        index_url = 'http://www.daomubiji.com/'
    
        def start_requests(self):
            yield scrapy.Request(url=self.index_url, callback=self.parse_book)
    
        def parse_book(self, response):
            # every link inside .article-content on the index page points to one book's chapter list
            for url in response.css('.article-content a'):
                book_url = url.css('a::attr(href)').extract_first()
                yield scrapy.Request(url=book_url, callback=self.parse_chapter)
    
        def parse_chapter(self, response):
            item = DaomuItem()
            book_title = response.css('.focusbox .container h1::text').extract_first()
            book_info = response.css('.focusbox .container .focusbox-text::text').extract_first()
            book_url = response.url
    
            for chapter in response.css('.excerpts-wrapper .excerpts .excerpt'):
                # the link text is space separated; keep its second and last tokens as the chapter title
                title_parts = chapter.css('a::text').extract_first().split(' ')
                chapter_title = title_parts[1] + ':' + title_parts[-1]
                chapter_url = chapter.css('a::attr(href)').extract_first()
                
                item['book_title'] = book_title
                item['book_info'] = book_info
                item['book_url'] = book_url
                item['chapter_title'] = chapter_title
                item['chapter_url'] = chapter_url
                
                # the item is yielded here with only the metadata; parse_detail yields it again with the chapter text added
                yield item
                yield scrapy.Request(url=chapter_url, callback=self.parse_detail, meta={'item': item})  # key point: carry the item to the next callback via meta
    
    
        def parse_detail(self, response):
            item = response.meta['item']
            # list of paragraph strings that make up the chapter body
            content = response.css('.article-content p::text').extract()
            item['content'] = content
            yield item
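
    The spider imports DaomuItem from daomu.items, but the post doesn't show that file. A minimal sketch of items.py, inferred from the fields the spider assigns above (the field names come from the spider; everything else is an assumption):

    # items.py -- sketch inferred from the fields the spider fills in
    import scrapy

    class DaomuItem(scrapy.Item):
        book_title = scrapy.Field()
        book_info = scrapy.Field()
        book_url = scrapy.Field()
        chapter_title = scrapy.Field()
        chapter_url = scrapy.Field()
        content = scrapy.Field()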

    The item pipeline (conventionally pipelines.py) writes every item into MongoDB:

    import pymongo
    
    class DaomuPipeline(object):
    
        def __init__(self):
            self.mongo_uri = 'localhost'
            self.mongo_db = 'daomu'
    
        # To read the connection settings from settings.py instead of hard-coding them,
        # enable this classmethod (note the name is from_crawler, not frow_crawler) and
        # change __init__ to accept mongo_uri and mongo_db arguments:
        # @classmethod
        # def from_crawler(cls, crawler):
        #     return cls(
        #         mongo_uri=crawler.settings.get('MONGO_URI'),
        #         mongo_db=crawler.settings.get('MONGO_DB')
        #     )
    
        def open_spider(self,spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]
    
        def process_item(self, item, spider):
            # use the item class name as the MongoDB collection name
            name = item.__class__.__name__
            # note: the item must be converted to a dict before inserting;
            # insert_one replaces the insert() method removed in pymongo 4
            self.db[name].insert_one(dict(item))
            return item
    
        def close_spider(self, spider):
            self.client.close()
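
    For the pipeline to run it has to be registered in the project's settings.py. A minimal sketch, assuming the project module is named daomu (the MONGO_URI / MONGO_DB keys match the commented-out from_crawler above; the priority value 300 is an arbitrary choice):

    # settings.py -- sketch, assuming the project module is named "daomu"
    ITEM_PIPELINES = {
        'daomu.pipelines.DaomuPipeline': 300,
    }

    # only needed if the from_crawler variant above is enabled
    MONGO_URI = 'localhost'
    MONGO_DB = 'daomu'

    With that in place, the crawl is started from the project root with scrapy crawl daomuspider (the name defined on the spider).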
  • Original article: https://www.cnblogs.com/themost/p/7093116.html