  • Scraping the Qidian novel site (Part 2): code design

    1. Installation
    pip install Scrapy
    # the DOS/command window must be run as administrator
    conda install scrapy
    2. Create the project
    scrapy startproject novel
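
    This generates roughly the following skeleton (exact files vary slightly with the Scrapy version); the spider files created in steps 3-7 below all go under novel/spiders/:

    novel/
        scrapy.cfg            # project config
        novel/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py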

    3. Create qidianClass4.py to crawl the first-level and second-level novel categories (names and links) and store them in the corresponding MongoDB collection and Redis list.

    import scrapy
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http import Request
    from bson.objectid import ObjectId
    import pymongo

    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.novel            # database: novel
    collection = db.novelclass   # collection holding the category tree

    import redis
    r = redis.Redis(host='127.0.0.1', port=6379, db=0)
    
    
    class qidianClassSpider(scrapy.Spider):
        name = "qidianClass4"
        allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
        start_urls = [
            "https://www.qidian.com/all",
        ]
    
        # parse() is called for each downloaded page
        def parse(self, response):
    
            hxs = HtmlXPathSelector(response)
            hxsObj = hxs.select('//div[@class="work-filter type-filter"]/ul[@type="category"]/li[@class=""]/a')
            for secItem in hxsObj:
                className = secItem.select('text()').extract()
                classUrl = secItem.select('@href').extract()
                classUrl = 'https:' + classUrl[0]
                print(className[0])
                print(classUrl)
                # top-level category: no parent, so pid is None
                classid = self.insertMongo(className[0], None)
                # bind the new category _id into the callback so the sub-category page knows its parent
                request = Request(classUrl, callback=lambda response, pid=str(classid): self.parse_subClass(response, pid))
                yield request
                print("======================")
        def parse_subClass(self, response,pid):
    
            hxs = HtmlXPathSelector(response)
            hxsObj = hxs.select('//div[@class="sub-type"]/dl[@class=""]/dd[@class=""]/a')
            for secItem in hxsObj:
                className2 = secItem.select('text()').extract()
                classUrl2 = secItem.select('@href').extract()
                print(className2)
                print('----------------------------')
                classUrl2 = 'https:' + classUrl2[0]
                print(classUrl2)
                classid = self.insertMongo(className2[0], ObjectId(pid))
                self.pushRedis(classid, pid, classUrl2)
    
        def insertMongo(self, classname, pid):
            classid = collection.insert({'classname': classname, 'pid': pid})
            return classid
    
        def pushRedis(self, classid, pid, url):
            novelurl = '%s,%s,%s' % (classid, pid, url)
            r.lpush('novelurl', novelurl)
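
    Save the file under novel/spiders/ and run it with "scrapy crawl qidianClass4". A minimal sketch for checking what landed in MongoDB and Redis (the file name check_classes.py is made up here; it assumes the same local MongoDB/Redis instances used above):

    # check_classes.py -- quick verification sketch, not part of the crawler
    import pymongo
    import redis

    db = pymongo.MongoClient(host="127.0.0.1").novel
    # each novelclass document holds a category name and its parent id (None for top-level)
    for doc in db.novelclass.find().limit(5):
        print(doc)

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)
    # each 'novelurl' entry has the form "classid,pid,url", as written by pushRedis()
    for item in r.lrange('novelurl', 0, 4):
        print(str(item, encoding="utf-8"))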
    

      4. Create qidianNovel.py to crawl novel names and links for each category and store them in the corresponding MongoDB collection and Redis list.

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.selector import HtmlXPathSelector
    from scrapy.http import Request
    from time import sleep

    import pymongo

    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.novel  # database: novel
    collection = db.novelname  # collection holding the novel names

    import redis  # redis client

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    ii = 0
    
    
    class qidianNovelSpider(scrapy.Spider):
        name = "qidianNovel"
        allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    
        def __init__(self):
            # global pid
            # read the category queue ('novelurl') that qidianClass4 wrote to Redis
            # qidianNovelSpider.start_urls=["https://www.qidian.com/all",]
            start_urls = []
            urlList = r.lrange('novelurl', 0, -1)
            ii = 0
            self.dict = {}
            for item in urlList:
                itemStr = str(item, encoding="utf-8")
                arr = itemStr.split(',')
                classid = arr[0]
                pid = arr[1]
                url = arr[2]
                start_urls.append(url)
                self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
                # ii += 1
                # if ii > 3:
                #     break
            print(start_urls)
            self.start_urls = start_urls
    
        def parse(self, response):
            classInfo = self.dict[response.url]
            objectid = classInfo['classid']
            pid = classInfo['pid']
            num = classInfo['num']
            # only follow the first few list pages of each category
            if num > 3:
                return None
            hxs = HtmlXPathSelector(response)
            hxsObj = hxs.select('//div[@class="book-mid-info"]/h4/a')
            for secItem in hxsObj:
                className = secItem.select('text()').extract()
                classUrl = secItem.select('@href').extract()
                classUrl = 'https:' + classUrl[0]
                print(className[0])
                print(classUrl)
                classid =self.insertMongo(className[0],objectid)
                self.pushRedis(classid,objectid, classUrl)
    
            # follow the category listing's "next page" link
            nextPage = self.nextUrl(response)
    
            if nextPage:
                # carry the same category info over to the next list page
                classInfo['num'] += 1
                self.dict[nextPage] = classInfo
                yield Request(nextPage, callback=self.parse)
            print('--------end--------------')
    # ---------------------------------------------------------------------------------------------------------------
    # =================== helper: extract the next-page link ===================================================
        def nextUrl(self, response):
            hxs = HtmlXPathSelector(response)
            # nextPage = hxs.select('//li[@class="lbf-pagination-item"]/a[@class="lbf-pagination-next "]')
            nextPage = hxs.select('//a[@class="lbf-pagination-next "]')
            # print(nextPage.extract())
            if len(nextPage) == 1:
                nextPage = nextPage.select('@href').extract()
                nextPage = "https:" + nextPage[0]
    
                print('==============' + nextPage + '====================')
                return nextPage
    
                # ===================== end of next-page helper ==================================================
    
    
        def insertMongo(self, className, pid):
            classid = collection.insert({'classname': className, 'pid': pid})
            return classid
    
    
        def pushRedis(self, classid, pid, classUrl):
            novelnameurl = '%s,%s,%s,' % (classid, pid, classUrl)
            r.lpush('novelnameurl', novelnameurl)
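
    Run it with "scrapy crawl qidianNovel" after the category spider has filled the 'novelurl' list. Each 'novelnameurl' entry has the form "classid,pid,url," (the trailing comma from the format string is harmless, since the later spiders only read the first three fields). A quick count check, a sketch assuming the same local Redis/MongoDB and the pymongo 3.x used above:

    import pymongo
    import redis

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)
    db = pymongo.MongoClient(host="127.0.0.1").novel

    print(r.llen('novelnameurl'))   # queued novel URLs for the next spiders
    print(db.novelname.count())     # one document per novel: {'classname': ..., 'pid': ...}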
    

      5. Create qidianNovelChapterInfo.py to crawl each novel's chapter titles and links and store them in the corresponding MongoDB collection and Redis list.

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http import Request
    from lxml import etree
    import pymongo

    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.novel  # database: novel
    collection = db.novelChapterInfo  # collection holding chapter titles/links

    import redis  # redis client

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    ii = 0
    
    
    class qidianNovelSpider(scrapy.Spider):
        name = "qidianNovelChapterInfo"
        allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    
        def __init__(self):
            # global pid
            # read the novel queue ('novelnameurl') that qidianNovel wrote to Redis
            # qidianNovelSpider.start_urls=["https://www.qidian.com/all",]
            start_urls = []
            urlList = r.lrange('novelnameurl', 0, -1)
            ii = 0
            self.dict = {}
            for item in urlList:
                itemStr = str(item, encoding="utf-8")
                arr = itemStr.split(',')
                classid = arr[0]
                pid = arr[1]
                url = arr[2]
                start_urls.append(url)
                self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
                # ii += 1
                # if ii > 1:
                #     break
            print(start_urls)
            self.start_urls = start_urls
    
        def parse(self, response):
            classInfo = self.dict[response.url]
            objectid = classInfo['classid']
            pid = classInfo['pid']
            # num = classInfo['num']
            # if num > 3:
            #     return None
            html = response.body.decode('utf-8')
            selector = etree.HTML(html)
            # chapter links on the book's catalogue page
            novelChapters = selector.xpath('//ul[@class="cf"]/li/a')
            for item in novelChapters:
                novelChapter= item.text
                print(item.text)
                novelChapterUrl='https:'+item.get('href')
                print(novelChapterUrl)
                # print(item.get('href'))
    
                classid = self.insertMongo(novelChapter, objectid)
                self.pushRedis(classid, objectid, novelChapterUrl)
    
        def insertMongo(self,novelChapter, pid):
            classid = collection.insert({'novelChapter': novelChapter,'pid': pid})
            return classid
    
        def pushRedis(self, classid,pid, novelChapterUrl):
            novelChapterUrl = '%s,%s,%s' % ( classid , pid, novelChapterUrl)
            r.lpush('novelChapterUrl', novelChapterUrl)
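
    Run it with "scrapy crawl qidianNovelChapterInfo". Each chapter document stores the parent novel's _id (as a string) in its pid field, so a novel's chapter list can be pulled back out like this (a sketch assuming the collections created above):

    import pymongo

    db = pymongo.MongoClient(host="127.0.0.1").novel
    novel = db.novelname.find_one()                              # any novel stored in step 4
    for ch in db.novelChapterInfo.find({'pid': str(novel['_id'])}):
        print(ch['novelChapter'])                                # chapter title, as stored by insertMongo()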
    

      6. Create qidianNovelWorksInfo.py to crawl each novel's basic information and update it into the existing novel-name collection of the MongoDB novel database.

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http import Request
    from lxml import etree
    import pymongo
    from bson.objectid import ObjectId

    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.novel  # database: novel
    collection = db.novelname  # novel-name collection created in step 4

    import redis  # redis client

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    ii = 0
    
    
    class qidianNovelSpider(scrapy.Spider):
        name = "qidianNovelWorksInfo"
        allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    
        def __init__(self):
            # global pid
            # read the novel queue ('novelnameurl') that qidianNovel wrote to Redis
            # qidianNovelSpider.start_urls=["https://www.qidian.com/all",]
            start_urls = []
            urlList = r.lrange('novelnameurl', 0, -1)
            ii = 0
            self.dict = {}
            for item in urlList:
                itemStr = str(item, encoding="utf-8")
                arr = itemStr.split(',')
                classid = arr[0]
                pid = arr[1]
                url = arr[2]
                start_urls.append(url)
                self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
                # ii += 1
                # if ii > 5:
                #     break
            print(start_urls)
            self.start_urls = start_urls
    
        def parse(self, response):
            classInfo = self.dict[response.url]
            objectid = classInfo['classid']
            objectid2 = ObjectId(objectid)
            pid = classInfo['pid']
            # num = classInfo['num']
            # if num > 3:
            #     return None
            html = response.body.decode('utf-8')
            selector = etree.HTML(html)
            workName = selector.xpath('//div[@class="book-info "]/h1/span/a[@class="writer"]/text()')              # author
            novelName = selector.xpath('//div[@class="book-info "]/h1/em/text()')                                  # title
            novelState = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/span[@class="blue"]/text()')   # status (serializing / finished)
            novelClass = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/a[@class="red"]/text()')       # category / sub-category
            objClass = novelClass[0]
            sonClass = novelClass[1]
            print("Novel: " + novelName[0])
            print("Author: " + workName[0])
            print("Status: " + novelState[0])
            print("Category: " + objClass)
            print("Sub-category: " + sonClass)

            # store the scalar values (xpath() returns lists, so take the first element)
            db.novelname.update({"_id": objectid2}, {"$set": {'workName': workName[0], 'novelName': novelName[0], 'novelState': novelState[0], 'objClass': objClass, 'sonClass': sonClass}})
    
    
            print('--------end--------------')
    # ---------------------------------------------------------------------------------------------------------------
    
    # def updateMongo(self, workName,novelName,novelState,objClass,sonClass,objectid2):
        #     # classid = collection.update({'workName': workName,'novelName':novelName,'novelState':novelState,'objClass':objClass,'sonClass':sonClass,'pid': pid})
        #     classid = collection.update({"_id":objectid2 },{"$set":{'workName': workName, 'novelName': novelName, 'novelState': novelState, 'objClass': objClass, 'sonClass': sonClass}})
        #     return classid
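
    After "scrapy crawl qidianNovelWorksInfo" has run, each novelname document looks roughly like this (values shown as placeholders):

    {
        '_id': ObjectId('...'),
        'classname': '...',     # novel title, stored in step 4
        'pid': '...',           # second-level category _id, stored as a string
        'workName': '...',      # author
        'novelName': '...',
        'novelState': '...',    # e.g. serializing / finished
        'objClass': '...',      # top-level category
        'sonClass': '...'       # sub-category
    }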
    

      7. Create qidianNovelChapterContent.py to crawl the chapter text and update it into the existing chapter collection of the MongoDB novel database.

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.http import Request
    from lxml import etree
    import pymongo
    from bson.objectid import ObjectId

    client = pymongo.MongoClient(host="127.0.0.1")
    db = client.novel  # database: novel
    collection = db.novelChapterInfo  # chapter collection created in step 5

    import redis  # redis client

    r = redis.Redis(host='127.0.0.1', port=6379, db=0)

    ii = 0
    
    
    class qidianNovelSpider(scrapy.Spider):
        name = "qidianNovelChapterContent"
        allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    
        def __init__(self):
            # global pid
            # read the chapter queue ('novelChapterUrl') that qidianNovelChapterInfo wrote to Redis
            #qidianNovelSpider.start_urls=["https://read.qidian.com/chapter/kbE0tc0oVoNrZK4x-CuJuw2/92LFs_xdtPXwrjbX3WA1AA2",]
            start_urls = []
            urlList = r.lrange('novelChapterUrl', 0,-1)
            ii = 0
            self.dict = {}
            for item in urlList:
                itemStr = str(item, encoding="utf-8")
                arr = itemStr.split(',')
                classid = arr[0]
                pid = arr[1]
                url = arr[2]
                start_urls.append(url)
                self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
                # ii += 1
                # if ii > 10:
                #     break
            # print(start_urls)
            self.start_urls = start_urls
    
        def parse(self, response):
            classInfo = self.dict[response.url]
            objectid = classInfo['classid']
            objectid2 = ObjectId(objectid)
            pid = classInfo['pid']
            num = classInfo['num']
            chapterText = ""
            # ==================================================================================
            html = response.body.decode('utf-8')
            selector = etree.HTML(html)
            # one <p> per paragraph of the chapter text
            novelChaptersContents = selector.xpath('//div[@class ="read-content j_readContent"]/p')
            for item in novelChaptersContents:
                # item.text can be None for empty paragraphs
                if item.text:
                    # append the paragraphs in page order
                    chapterText = chapterText + item.text
            # write the assembled chapter text back onto the chapter document from step 5
            db.novelChapterInfo.update({"_id": objectid2}, {"$set": {'novelChaptersContent': chapterText}})
            # sleep(0.3)
            print('------------------------------------------------------')
    
    # ---------------------------------------------------------------------------------------------------------------
        # def nextChapter(self, response):
        #     hxs = HtmlXPathSelector(response)
        #     nextChapter = hxs.select('//div[@"chapter-control dib-wrap"]/a[@id = "j_chapterNext"]')
        #     # print(nextPage.extract())
        #     if len(nextChapter) == 1:
        #         nextChapter = nextChapter.select('@href').extract()
        #         nextChapter= "https:" + nextChapter[0]
        #         print('==============' + nextChapter + '====================')
        #         return nextChapter 
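
    A quick read-back sketch for one chapter after "scrapy crawl qidianNovelChapterContent" has run (assuming the same collections as above):

    import pymongo

    db = pymongo.MongoClient(host="127.0.0.1").novel
    chapter = db.novelChapterInfo.find_one({'novelChaptersContent': {'$exists': True}})
    print(chapter['novelChapter'])                  # chapter title stored in step 5
    print(chapter['novelChaptersContent'][:200])    # beginning of the scraped chapter text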

    8. To run a spider, execute the following in a DOS/command window from the project root:
    scrapy crawl <spider name>    # the name = "..." defined in the corresponding .py file
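
    For this project the spiders defined above are run in order, each one feeding the Redis queue that the next one reads:

    scrapy crawl qidianClass4               # categories    -> 'novelurl'
    scrapy crawl qidianNovel                # novel list    -> 'novelnameurl'
    scrapy crawl qidianNovelChapterInfo     # chapter list  -> 'novelChapterUrl'
    scrapy crawl qidianNovelWorksInfo       # book info     -> updates novelname
    scrapy crawl qidianNovelChapterContent  # chapter text  -> updates novelChapterInfo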

    I have been busy with an ongoing project recently and have not had time to tidy this up; apologies.
