  • Improving the Scrapy novel scraper (MongoDB version)

    I. Background: the original program had to scrape a novel successfully in a single run; otherwise the whole crawl had to start over from the beginning, which hurt crawling efficiency.

    II. Improvement plan

    (1) Check what has already been scraped: if a chapter is already stored in MongoDB, skip it instead of fetching it again (see the sketch after this list).

    (2) Time the total crawl.
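
    A minimal sketch of both ideas, reusing the MongoDB connection string from the pipeline code below; the novel's collection is named after the spider, here 'sancun'. The helper already_scraped() is a name invented for this sketch only; the actual spider performs the same check by iterating over a cursor:

    import time
    from pymongo import MongoClient

    conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
    myset = conn.novels['sancun']      # one collection per novel

    start_time = time.time()           # idea (2): record when the crawl starts

    def already_scraped(url_c):
        # Idea (1): a chapter is skipped if a record with its url is already stored.
        return myset.find_one({"url": url_c}) is not None

    # ... schedule only the chapter urls for which already_scraped() returns False ...

    print("Total crawl time:", time.time() - start_time)   # idea (2): report the elapsed time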

    III. Code

    (1) xbiquge/pipelines.py

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    import os
    import time
    from twisted.enterprise import adbapi
    from pymongo import MongoClient
    
    class XbiqugePipeline(object):
        conn = MongoClient('mongodb://admin:admin@localhost:27017/admin')
        db = conn.novels  # connection object for the "novels" database
        name_novel = ''
        url_firstchapter = ''
        name_txt = ''
        start_time=time.time()

        # class initialization
        def __init__(self):

            return

        # called when the spider opens
        def open_spider(self, spider):

            return

        def get_collection(self,name_collection):  # return the collection (cursor) object for a given collection name
            myset = self.db[name_collection]
            return myset

        def process_item(self, item, spider):
            #if self.name_novel == '':
            self.name_novel = item['name']
            self.url_firstchapter = item['url_firstchapter']
            self.name_txt = item['name_txt']
            myset = self.db[self.name_novel]
            myset.insert_one(dict(item))
    #        if self.name_novel != '':
    #            exec('self.db.'+ self.name_novel + '.insert_one(dict(item))')
            return item

        # read the chapter contents back from the database and write them to a txt file
        def content2txt(self,dbname,firsturl,txtname):
            myset = self.db[dbname]
            record_num = myset.find().count() # number of stored chapters (Cursor.count() is the pre-4.0 PyMongo API)
            print("Total number of chapters:",record_num)
            counts=record_num
            url_c = firsturl
            start_time=time.time()  # start time of the txt-generation step
            f = open(txtname+".txt", mode='w', encoding='utf-8')   # open <novel name>.txt for writing
            for i in range(counts):  # loop over all counts chapters
    #-----------Use the integer returned by count() to decide whether the record exists-------------
    #            record_m_count=myset.find({"url": url_c},{"content":1,"_id":0}).count()
    #            if record_m_count == 0:
    #               print("Chapter content not found in the collection.\nFailing url:",url_c)
    #               break
    #--------------------------------------------------------------------------------

    #-----------Read the cursor with next() and catch StopIteration when no record is returned-----
                try:
                    record_m=myset.find({"url": url_c},{"content":1,"_id":0}).next()
                #except Exception as e:
                except StopIteration:
                    print("No chapter content was retrieved from the collection.\nFailing url:",url_c)
                    break   # leave the for loop and abort txt generation
    #--------------------------------------------------------------------------------
                record_content_c2a0 = ''

    #------------Alternative: read the cursor with a for loop---------------------------------
    #            record_i = myset.find({"url": url_c},{"content":1,"_id":0})
    #            for record_m in record_i:
    #                record_content_c2a0 = record_m["content"]  # chapter content
    #---------------------------------------------------------------------------
                record_content_c2a0 = record_m["content"]

                #record_content=record_content_c2a0.replace(u'\xa0', u'')  # strip the special character \xc2\xa0 (non-breaking space)
                record_content=record_content_c2a0
                #print(record_content)
                f.write('\n')
                f.write(record_content + '\n')
                f.write('\n\n')
                url_ct = myset.find({"url": url_c},{"next_page":1,"_id":0})  # query object holding the next-chapter link
                for item_url in url_ct:
                    url_c = item_url["next_page"]  # the next-chapter url becomes url_c for the next loop iteration
                    #print("next page",url_c)
            f.close()
            print("Time to generate the txt file:",time.time()-start_time)
            print("Total crawl time:",time.time()-self.start_time)
            print(txtname + ".txt" + " has been generated!")
            return

        # when the spider closes, call content2txt to generate the txt file
        def close_spider(self,spider):
            if self.name_novel !='' and self.url_firstchapter != '' and self.name_txt != '':
                self.content2txt(self.name_novel,self.url_firstchapter,self.name_txt)
            return
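
    As the header comment in pipelines.py notes, the pipeline only runs if it is registered in the project's ITEM_PIPELINES setting. A minimal sketch of the relevant part of xbiquge/settings.py (the priority value 300 is the usual example value, not taken from the original post):

    # xbiquge/settings.py (excerpt)
    ITEM_PIPELINES = {
        'xbiquge.pipelines.XbiqugePipeline': 300,
    }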
    

    (2) Example spider code: xbiquge/spiders/sancun.py

    # -*- coding: utf-8 -*-
    import scrapy
    from xbiquge.items import XbiqugeItem
    from xbiquge.pipelines import XbiqugePipeline
    
    class SancunSpider(scrapy.Spider):
        name = 'sancun'
        allowed_domains = ['www.xbiquge.la']
        #start_urls = ['http://www.xbiquge.la/10/10489/']
        url_ori= "https://www.xbiquge.la"
        url_firstchapter = "https://www.xbiquge.la/10/10489/4534454.html"
        name_txt = "./novels/三寸人间"
        url_chapters = url_firstchapter[0:32]
        pipeline=XbiqugePipeline()
        novelcollection=pipeline.get_collection(name) # collection (cursor) object for this novel; a mongodb collection is the analogue of a mysql table
        #--------------------------------------------
        # If a record's next_page value is the novel's contents-page url, delete that record; otherwise a later
        # re-crawl could leave several records pointing at the contents page and the newest chapters could never be fetched.
        if novelcollection.find({"next_page":url_chapters}).count() != 0 :
            print("Number of records containing the contents-page url:",novelcollection.find({"next_page":url_chapters}).count())
            novelcollection.remove({"next_page":url_chapters})
            print("Records containing the contents-page url have been deleted.")
        #--------------------------------------------
        novelcounts=novelcollection.find().count()
        novelurls=novelcollection.find({},{"_id":0,"id":1,"url":1})
        item = XbiqugeItem()
        item['id'] = novelcounts         # initialize id to the number of records already in the collection
        item['name'] = name
        item['url_firstchapter'] = url_firstchapter
        item['name_txt'] = name_txt

        def start_requests(self):
            start_urls = [self.url_chapters]
            print("Novel contents-page url:",start_urls)
            for url in start_urls:
                yield scrapy.Request(url=url, callback=self.parse)

        def parse(self, response):    # extract chapter links from the contents page and compare them with the mongodb collection; only urls not already stored are scraped
            count_bingo=0   # number of chapters already present in the collection
            dl = response.css('#list dl dd')     # chapter link elements
            for dd in dl:
                count_iterator = 0
                self.url_c = self.url_ori + dd.css('a::attr(href)').extract()[0]   # build the full url of each chapter
                #print("url extracted from the page:", self.url_c)
                self.novelurls=self.novelcollection.find({},{"_id":0,"id":1,"url":1})   # re-run the query to reset the cursor so the for loop can traverse it from the start
                for url in self.novelurls:
                    #print("url from mongodb:", url)
                    if url["url"]==self.url_c:      # the url extracted from the page is already in the collection, so stop searching
                        count_bingo += 1
                        count_iterator += 1
                        break
                if count_iterator != 0 :            # the chapter was found, so skip it and continue with the next one
                    continue
                print("Scraping url:",self.url_c)
                #yield scrapy.Request(self.url_c, callback=self.parse_c,dont_filter=True)
                yield scrapy.Request(self.url_c, callback=self.parse_c)    # call parse_c as a generator (yield) to obtain the chapter url, previous/next page links and chapter content
                #print(self.url_c)
            print("Records already in the collection, count_bingo:",count_bingo)

        def parse_c(self, response):
            self.item['id'] += 1
            self.item['url'] = response.url
            self.item['preview_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[1]
            self.item['next_page'] = self.url_ori + response.css('div .bottem1 a::attr(href)').extract()[3]
            title = response.css('.con_top::text').extract()[4]
            contents = response.css('#content::text').extract()
            text=''
            for content in contents:
                text = text + content
            #print(text)
            self.item['content'] = title + "\n" + text.replace('\15', '\n')     # join chapter title and body; '\15' is the octal escape for ^M (carriage return) and is replaced with a newline
            yield self.item     # yield the populated Item to the pipelines module

            if self.item['url'][32:39] == self.item['next_page'][32:39]: # the same chapter is split across several pages
                self.url_c = self.item['next_page']
                yield scrapy.Request(self.url_c, callback=self.parse_c)
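
    The post does not show xbiquge/items.py. Judging from the fields the spider and the pipeline actually use, it presumably looks roughly like the following sketch (field comments added here for clarity; this is not the author's original file):

    # xbiquge/items.py (sketch inferred from the fields used above)
    import scrapy

    class XbiqugeItem(scrapy.Item):
        id = scrapy.Field()                # running chapter number
        name = scrapy.Field()              # novel name, also used as the collection name
        url_firstchapter = scrapy.Field()  # url of the first chapter
        name_txt = scrapy.Field()          # output txt file path (without the .txt extension)
        url = scrapy.Field()               # url of the current chapter page
        preview_page = scrapy.Field()      # url of the previous page
        next_page = scrapy.Field()         # url of the next page
        content = scrapy.Field()           # chapter title plus body text

    With the pipeline registered, the crawl is started as usual with "scrapy crawl sancun" from the project root; if it is interrupted, re-running the same command only fetches the chapters that are not yet in MongoDB and then regenerates the txt file.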
    

      

  • Original article: https://www.cnblogs.com/sfccl/p/14821652.html