1. Installation
pip install Scrapy
# Be sure to run the command prompt (DOS window) as administrator.
# Alternatively, if you use Anaconda:
conda install scrapy
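To confirm the installation, you can run scrapy version in the same window, or check it from Python:

import scrapy
print(scrapy.__version__)   # prints the installed Scrapy version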
2. Create the project
scrapy startproject novel
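startproject generates roughly the following skeleton (the exact files vary slightly by Scrapy version). The spider files created in the steps below all go into novel/novel/spiders/:

novel/
    scrapy.cfg          # project configuration
    novel/
        __init__.py
        items.py        # item definitions (not used in this tutorial)
        pipelines.py    # item pipelines (not used here; data is written directly to MongoDB/Redis)
        settings.py     # project settings
        spiders/        # put qidianClass4.py, qidianNovel.py, ... here
            __init__.py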
3. Create qidianClass4.py to crawl the first-level and second-level novel categories (names and links) and store them in the corresponding MongoDB collection and Redis list.
import scrapy
from scrapy.selector import HtmlXPathSelector  # deprecated in newer Scrapy; response.xpath() is the modern equivalent
from scrapy.http import Request
from bson.objectid import ObjectId
import pymongo

# MongoDB connection: database "novel", collection "novelclass"
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelclass

import redis
# Redis connection; the category URLs are queued here for the later spiders
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianClassSpider(scrapy.Spider):
name = "qidianClass4"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
start_urls = [
"https://www.qidian.com/all",
]
    # parse() is called back for every page that is crawled
def parse(self, response):
hxs = HtmlXPathSelector(response)
hxsObj = hxs.select('//div[@class="work-filter type-filter"]/ul[@type="category"]/li[@class=""]/a')
for secItem in hxsObj:
className = secItem.select('text()').extract()
classUrl = secItem.select('@href').extract()
classUrl = 'https:' + classUrl[0]
print(className[0])
print(classUrl)
classid = self.insertMongo(className[0],None)
request = Request(classUrl, callback=lambda response, pid=str(classid): self.parse_subClass(response, pid))
yield request
print("======================")
def parse_subClass(self, response,pid):
hxs = HtmlXPathSelector(response)
hxsObj = hxs.select('//div[@class="sub-type"]/dl[@class=""]/dd[@class=""]/a')
for secItem in hxsObj:
className2 = secItem.select('text()').extract()
classUrl2 = secItem.select('@href').extract()
print(className2)
print('----------------------------')
classUrl2 = 'https:' + classUrl2[0]
print(classUrl2)
classid = self.insertMongo(className2[0], ObjectId(pid))
self.pushRedis(classid, pid, classUrl2)
    def insertMongo(self, classname, pid):
        # insert() was removed from recent PyMongo; insert_one() is the current API
        classid = collection.insert_one({'classname': classname, 'pid': pid}).inserted_id
        return classid
def pushRedis(self, classid, pid, url):
novelurl = '%s,%s,%s' % (classid, pid, url)
r.lpush('novelurl', novelurl)
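After this spider has run (see step 8), the results can be spot-checked; a minimal sketch assuming the same local MongoDB and Redis instances configured above:

import pymongo
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel

# top-level categories are stored with pid None, sub-categories with their parent's _id
for doc in db.novelclass.find().limit(5):
    print(doc['classname'], doc['pid'])

# each record in the "novelurl" list is "classid,pid,url" for one sub-category listing page
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
print(r.llen('novelurl'))
print(r.lrange('novelurl', 0, 2))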
4. Create qidianNovel.py to crawl the novel names and links under each sub-category and store them in the corresponding MongoDB collection and Redis list.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
import pymongo

# MongoDB connection: database "novel", collection "novelname"
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelname

import redis  # Redis supplies the category URLs queued by qidianClass4
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianNovelSpider(scrapy.Spider):
name = "qidianNovel"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    def __init__(self):
        # Read the category URLs queued in the Redis list "novelurl" by qidianClass4
start_urls = []
urlList = r.lrange('novelurl', 0, -1)
ii = 0
self.dict = {}
for item in urlList:
itemStr = str(item, encoding="utf-8")
arr = itemStr.split(',')
classid = arr[0]
pid = arr[1]
url = arr[2]
start_urls.append(url)
self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
# ii += 1
# if ii > 3:
# break
print(start_urls)
self.start_urls = start_urls
def parse(self, response):
classInfo = self.dict[response.url]
objectid = classInfo['classid']
pid = classInfo['pid']
        num = classInfo['num']
        if num > 3:  # crawl at most 4 listing pages per category
            return None
hxs = HtmlXPathSelector(response)
hxsObj = hxs.select('//div[@class="book-mid-info"]/h4/a')
for secItem in hxsObj:
className = secItem.select('text()').extract()
classUrl = secItem.select('@href').extract()
classUrl = 'https:' + classUrl[0]
print(className[0])
print(classUrl)
classid =self.insertMongo(className[0],objectid)
self.pushRedis(classid,objectid, classUrl)
        nextPage = self.nextUrl(response)
        # sleep(0.3)
        # (the next-page link could also be extracted inline here instead of via nextUrl())
        # follow the next page only when one exists; nextUrl() returns None on the last page
        if nextPage:
            classInfo['num'] += 1
            self.dict[nextPage] = classInfo
            request = Request(nextPage, callback=self.parse)
            yield request
        print('--------end--------------')
# ---------------------------------------------------------------------------------------------------------------
    # ================== helper: extract the next-page link ==================
def nextUrl(self, response):
hxs = HtmlXPathSelector(response)
# nextPage = hxs.select('//li[@class="lbf-pagination-item"]/a[@class="lbf-pagination-next "]')
nextPage = hxs.select('//a[@class="lbf-pagination-next "]')
# print(nextPage.extract())
        if len(nextPage) == 1:
            nextPage = nextPage.select('@href').extract()
            nextPage = "https:" + nextPage[0]
            print('==============' + nextPage + '====================')
            return nextPage
        return None  # no "next page" link on the last listing page
    # ================== end of next-page helper ==================
    def insertMongo(self, className, pid):
        # insert() was removed from recent PyMongo; insert_one() is the current API
        classid = collection.insert_one({'classname': className, 'pid': pid}).inserted_id
        return classid
    def pushRedis(self, classid, pid, classUrl):
        novelnameurl = '%s,%s,%s' % (classid, pid, classUrl)   # record format: classid,pid,url
        r.lpush('novelnameurl', novelnameurl)
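Every record this spider pushes to the "novelnameurl" list has the same "classid,pid,url" layout as before; the next two spiders split it back apart in their __init__. A quick way to peek at one record, assuming the same local Redis:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)
record = r.lindex('novelnameurl', 0)        # first queued record, or None if the list is empty
if record is not None:
    classid, pid, url = str(record, encoding="utf-8").split(',')[:3]
    print(classid, pid, url)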
5. Create qidianNovelChapterInfo.py to crawl the chapter titles and links of each novel and store them in the corresponding MongoDB collection and Redis list.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo

# MongoDB connection: database "novel", collection "novelChapterInfo"
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelChapterInfo

import redis  # Redis supplies the novel URLs queued by qidianNovel
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianNovelSpider(scrapy.Spider):
name = "qidianNovelChapterInfo"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    def __init__(self):
        # Read the novel URLs queued in the Redis list "novelnameurl" by qidianNovel
start_urls = []
urlList = r.lrange('novelnameurl', 0, -1)
ii = 0
self.dict = {}
for item in urlList:
itemStr = str(item, encoding="utf-8")
arr = itemStr.split(',')
classid = arr[0]
pid = arr[1]
url = arr[2]
start_urls.append(url)
self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
# ii += 1
# if ii > 1:
# break
print(start_urls)
self.start_urls = start_urls
def parse(self, response):
classInfo = self.dict[response.url]
objectid = classInfo['classid']
pid = classInfo['pid']
# num = classInfo['num']
# if num > 3:
# return None
html = response.body.decode('utf-8')
selector = etree.HTML(html)
novelChapters = selector.xpath('//ul[@class="cf"]/li/a')
for item in novelChapters:
novelChapter= item.text
print(item.text)
novelChapterUrl='https:'+item.get('href')
print(novelChapterUrl)
# print(item.get('href'))
classid = self.insertMongo(novelChapter, objectid)
self.pushRedis(classid, objectid, novelChapterUrl)
    def insertMongo(self, novelChapter, pid):
        # insert() was removed from recent PyMongo; insert_one() is the current API
        classid = collection.insert_one({'novelChapter': novelChapter, 'pid': pid}).inserted_id
        return classid
def pushRedis(self, classid,pid, novelChapterUrl):
novelChapterUrl = '%s,%s,%s' % ( classid , pid, novelChapterUrl)
r.lpush('novelChapterUrl', novelChapterUrl)
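Note that this spider stores the parent novel's _id in the chapter document's pid field as a plain string (it comes straight out of the Redis record), so queries that join chapters back to a novel have to compare against str(_id). A small check, assuming the same local MongoDB:

import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel

novel = db.novelname.find_one()                       # any novel crawled in step 4
if novel is not None:
    # chapters reference the novel through pid, stored as the string form of the ObjectId
    for chap in db.novelChapterInfo.find({'pid': str(novel['_id'])}).limit(10):
        print(chap['novelChapter'])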
6. Create qidianNovelWorksInfo.py to crawl each novel's basic information (author, status, categories) and update it into the existing novel-name collection of the MongoDB "novel" database.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId

# MongoDB connection: database "novel", collection "novelname" (filled by qidianNovel)
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelname

import redis  # Redis supplies the novel URLs queued by qidianNovel
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianNovelSpider(scrapy.Spider):
name = "qidianNovelWorksInfo"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    def __init__(self):
        # Read the novel URLs queued in the Redis list "novelnameurl" by qidianNovel
start_urls = []
urlList = r.lrange('novelnameurl', 0, -1)
ii = 0
self.dict = {}
for item in urlList:
itemStr = str(item, encoding="utf-8")
arr = itemStr.split(',')
classid = arr[0]
pid = arr[1]
url = arr[2]
start_urls.append(url)
self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
# ii += 1
# if ii > 5:
# break
print(start_urls)
self.start_urls = start_urls
def parse(self, response):
classInfo = self.dict[response.url]
objectid = classInfo['classid']
objectid2 = ObjectId(objectid)
pid = classInfo['pid']
# num = classInfo['num']
# if num > 3:
# return None
html = response.body.decode('utf-8')
selector = etree.HTML(html)
workName = selector.xpath('//div[@class="book-info "]/h1/span/a[@class="writer"]/text()')
novelName = selector.xpath('//div[@class="book-info "]/h1/em/text()')
novelState = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/span[@class="blue"]/text()')
novelClass = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/a[@class="red"]/text()')
        objClass = novelClass[0]
        sonClass = novelClass[1]
        print("Novel name: " + novelName[0])
        print("Author: " + workName[0])
        print("Status: " + novelState[0])
        print("Category: " + objClass)
        print("Sub-category: " + sonClass)
        # store the first XPath match of each field; update() was removed from recent PyMongo, update_one() is the current API
        db.novelname.update_one({"_id": objectid2}, {"$set": {'workName': workName[0], 'novelName': novelName[0], 'novelState': novelState[0], 'objClass': objClass, 'sonClass': sonClass}})
print('--------end--------------')
# ---------------------------------------------------------------------------------------------------------------
# def updateMongo(self, workName,novelName,novelState,objClass,sonClass,objectid2):
# # classid = collection.update({'workName': workName,'novelName':novelName,'novelState':novelState,'objClass':objClass,'sonClass':sonClass,'pid': pid})
# classid = collection.update({"_id":objectid2 },{"$set":{'workName': workName, 'novelName': novelName, 'novelState': novelState, 'objClass': objClass, 'sonClass': sonClass}})
# return classid
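Once this spider has run, the documents in novelname carry the extra fields written by the $set update; a minimal check, assuming the same local MongoDB:

import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel

doc = db.novelname.find_one({'workName': {'$exists': True}})   # any novel already enriched by this spider
if doc is not None:
    print(doc['novelName'], doc['workName'], doc['novelState'], doc['objClass'], doc['sonClass'])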
7. Create qidianNovelChapterContent.py to crawl the chapter content and update it into the existing chapter collection of the MongoDB "novel" database.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId

# MongoDB connection: database "novel", collection "novelChapterInfo" (filled by qidianNovelChapterInfo)
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelChapterInfo

import redis  # Redis supplies the chapter URLs queued by qidianNovelChapterInfo
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianNovelSpider(scrapy.Spider):
name = "qidianNovelChapterContent"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    def __init__(self):
        # Read the chapter URLs queued in the Redis list "novelChapterUrl" by qidianNovelChapterInfo
start_urls = []
urlList = r.lrange('novelChapterUrl', 0,-1)
ii = 0
self.dict = {}
for item in urlList:
itemStr = str(item, encoding="utf-8")
arr = itemStr.split(',')
classid = arr[0]
pid = arr[1]
url = arr[2]
start_urls.append(url)
self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
# ii += 1
# if ii > 10:
# break
# print(start_urls)
self.start_urls = start_urls
def parse(self, response):
classInfo = self.dict[response.url]
objectid = classInfo['classid']
objectid2 = ObjectId(objectid)
pid = classInfo['pid']
num = classInfo['num']
ii = ""
#==================================================================================
html = response.body.decode('utf-8')
selector = etree.HTML(html)
novelChaptersContents = selector.xpath('//div[@class ="read-content j_readContent"]/p')
# print(novelChaptersContent)
        for item in novelChaptersContents:
            novelChaptersContent = item.text
            if novelChaptersContent:
                ii = ii + novelChaptersContent   # append paragraphs in page order
        # update() was removed from recent PyMongo; update_one() is the current API
        db.novelChapterInfo.update_one({"_id": objectid2}, {"$set": {'novelChaptersContent': ii}})
print('------------------------------------------------------')
# ---------------------------------------------------------------------------------------------------------------
# def nextChapter(self, response):
# hxs = HtmlXPathSelector(response)
# nextChapter = hxs.select('//div[@"chapter-control dib-wrap"]/a[@id = "j_chapterNext"]')
# # print(nextPage.extract())
# if len(nextChapter) == 1:
# nextChapter = nextChapter.select('@href').extract()
# nextChapter= "https:" + nextChapter[0]
# print('==============' + nextChapter + '====================')
# return nextChapter
8. Run: from the project root, execute in the command prompt:
scrapy crawl <spider name>     (use the value of name = "..." defined in the corresponding .py file)
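Each spider reads the Redis queue filled by the previous one, so they need to be run in order, for example:

scrapy crawl qidianClass4               # step 3: categories   -> novelclass + "novelurl"
scrapy crawl qidianNovel                # step 4: novel list   -> novelname  + "novelnameurl"
scrapy crawl qidianNovelChapterInfo     # step 5: chapters     -> novelChapterInfo + "novelChapterUrl"
scrapy crawl qidianNovelWorksInfo       # step 6: novel info   -> updates novelname
scrapy crawl qidianNovelChapterContent  # step 7: chapter text -> updates novelChapterInfo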
I have been busy with ongoing projects recently and have not had time to tidy this up properly; apologies.