1. Installation
pip install Scrapy
# Be sure to run the command prompt (DOS window) as administrator.
# Alternatively, if you use Anaconda:
conda install scrapy
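To confirm the installation, you can run scrapy version in the same window, or check it from Python:

import scrapy
print(scrapy.__version__)   # prints the installed Scrapy version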
2. Create the project
scrapy startproject novel
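startproject generates roughly the following skeleton (the exact files vary slightly by Scrapy version). The spider files created in the steps below all go into novel/novel/spiders/:

novel/
    scrapy.cfg          # project configuration
    novel/
        __init__.py
        items.py        # item definitions (not used in this tutorial)
        pipelines.py    # item pipelines (not used here; data is written directly to MongoDB/Redis)
        settings.py     # project settings
        spiders/        # put qidianClass4.py, qidianNovel.py, ... here
            __init__.py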
3. Create qidianClass4.py to crawl the first-level and second-level novel categories (names and links) and store them in the corresponding MongoDB collection and Redis list.
import scrapy
from scrapy.selector import HtmlXPathSelector  # deprecated in newer Scrapy; response.xpath() is the modern equivalent
from scrapy.http import Request
from bson.objectid import ObjectId
import pymongo

# MongoDB connection: database "novel", collection "novelclass"
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelclass

import redis
# Redis connection; the category URLs are queued here for the later spiders
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianClassSpider(scrapy.Spider):
name = "qidianClass4"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
start_urls = [
"https://www.qidian.com/all",
]
    # parse() is called back for every page that is crawled
def parse(self, response):
hxs = HtmlXPathSelector(response)
hxsObj = hxs.select('//div[@class="work-filter type-filter"]/ul[@type="category"]/li[@class=""]/a')
for secItem in hxsObj:
className = secItem.select('text()').extract()
classUrl = secItem.select('@href').extract()
classUrl = 'https:' + classUrl[0]
print(className[0])
print(classUrl)
classid = self.insertMongo(className[0],None)
request = Request(classUrl, callback=lambda response, pid=str(classid): self.parse_subClass(response, pid))
yield request
print("======================")
def parse_subClass(self, response,pid):
hxs = HtmlXPathSelector(response)
hxsObj = hxs.select('//div[@class="sub-type"]/dl[@class=""]/dd[@class=""]/a')
for secItem in hxsObj:
className2 = secItem.select('text()').extract()
classUrl2 = secItem.select('@href').extract()
print(className2)
print('----------------------------')
classUrl2 = 'https:' + classUrl2[0]
print(classUrl2)
classid = self.insertMongo(className2[0], ObjectId(pid))
self.pushRedis(classid, pid, classUrl2)
    def insertMongo(self, classname, pid):
        # insert() was removed from recent PyMongo; insert_one() is the current API
        classid = collection.insert_one({'classname': classname, 'pid': pid}).inserted_id
        return classid
def pushRedis(self, classid, pid, url):
novelurl = '%s,%s,%s' % (classid, pid, url)
r.lpush('novelurl', novelurl)
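After this spider has run (see step 8), the results can be spot-checked; a minimal sketch assuming the same local MongoDB and Redis instances configured above:

import pymongo
import redis

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel

# top-level categories are stored with pid None, sub-categories with their parent's _id
for doc in db.novelclass.find().limit(5):
    print(doc['classname'], doc['pid'])

# each record in the "novelurl" list is "classid,pid,url" for one sub-category listing page
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
print(r.llen('novelurl'))
print(r.lrange('novelurl', 0, 2))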
4. Create qidianNovel.py to crawl the novel names and links under each sub-category and store them in the corresponding MongoDB collection and Redis list.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
import pymongo

# MongoDB connection: database "novel", collection "novelname"
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelname

import redis  # Redis supplies the category URLs queued by qidianClass4
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianNovelSpider(scrapy.Spider):
name = "qidianNovel"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    def __init__(self):
        # Read the category URLs queued in the Redis list "novelurl" by qidianClass4
start_urls = []
urlList = r.lrange('novelurl', 0, -1)
ii = 0
self.dict = {}
for item in urlList:
itemStr = str(item, encoding="utf-8")
arr = itemStr.split(',')
classid = arr[0]
pid = arr[1]
url = arr[2]
start_urls.append(url)
self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
# ii += 1
# if ii > 3:
# break
print(start_urls)
self.start_urls = start_urls
def parse(self, response):
classInfo = self.dict[response.url]
objectid = classInfo['classid']
pid = classInfo['pid']
        num = classInfo['num']
        if num > 3:  # crawl at most 4 listing pages per category
            return None
hxs = HtmlXPathSelector(response)
hxsObj = hxs.select('//div[@class="book-mid-info"]/h4/a')
for secItem in hxsObj:
className = secItem.select('text()').extract()
classUrl = secItem.select('@href').extract()
classUrl = 'https:' + classUrl[0]
print(className[0])
print(classUrl)
classid =self.insertMongo(className[0],objectid)
self.pushRedis(classid,objectid, classUrl)
        nextPage = self.nextUrl(response)
        # sleep(0.3)
        # (the next-page link could also be extracted inline here instead of via nextUrl())
        # follow the next page only when one exists; nextUrl() returns None on the last page
        if nextPage:
            classInfo['num'] += 1
            self.dict[nextPage] = classInfo
            request = Request(nextPage, callback=self.parse)
            yield request
        print('--------end--------------')
# ---------------------------------------------------------------------------------------------------------------
    # ================== helper: extract the next-page link ==================
def nextUrl(self, response):
hxs = HtmlXPathSelector(response)
# nextPage = hxs.select('//li[@class="lbf-pagination-item"]/a[@class="lbf-pagination-next "]')
nextPage = hxs.select('//a[@class="lbf-pagination-next "]')
# print(nextPage.extract())
        if len(nextPage) == 1:
            nextPage = nextPage.select('@href').extract()
            nextPage = "https:" + nextPage[0]
            print('==============' + nextPage + '====================')
            return nextPage
        return None  # no "next page" link on the last listing page
    # ================== end of next-page helper ==================
    def insertMongo(self, className, pid):
        # insert() was removed from recent PyMongo; insert_one() is the current API
        classid = collection.insert_one({'classname': className, 'pid': pid}).inserted_id
        return classid
    def pushRedis(self, classid, pid, classUrl):
        novelnameurl = '%s,%s,%s' % (classid, pid, classUrl)   # record format: classid,pid,url
        r.lpush('novelnameurl', novelnameurl)
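Every record this spider pushes to the "novelnameurl" list has the same "classid,pid,url" layout as before; the next two spiders split it back apart in their __init__. A quick way to peek at one record, assuming the same local Redis:

import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)
record = r.lindex('novelnameurl', 0)        # first queued record, or None if the list is empty
if record is not None:
    classid, pid, url = str(record, encoding="utf-8").split(',')[:3]
    print(classid, pid, url)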
5. Create qidianNovelChapterInfo.py to crawl the chapter titles and links of each novel and store them in the corresponding MongoDB collection and Redis list.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo

# MongoDB connection: database "novel", collection "novelChapterInfo"
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelChapterInfo

import redis  # Redis supplies the novel URLs queued by qidianNovel
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianNovelSpider(scrapy.Spider):
name = "qidianNovelChapterInfo"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    def __init__(self):
        # Read the novel URLs queued in the Redis list "novelnameurl" by qidianNovel
start_urls = []
urlList = r.lrange('novelnameurl', 0, -1)
ii = 0
self.dict = {}
for item in urlList:
itemStr = str(item, encoding="utf-8")
arr = itemStr.split(',')
classid = arr[0]
pid = arr[1]
url = arr[2]
start_urls.append(url)
self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
# ii += 1
# if ii > 1:
# break
print(start_urls)
self.start_urls = start_urls
def parse(self, response):
classInfo = self.dict[response.url]
objectid = classInfo['classid']
pid = classInfo['pid']
# num = classInfo['num']
# if num > 3:
# return None
html = response.body.decode('utf-8')
selector = etree.HTML(html)
novelChapters = selector.xpath('//ul[@class="cf"]/li/a')
for item in novelChapters:
novelChapter= item.text
print(item.text)
novelChapterUrl='https:'+item.get('href')
print(novelChapterUrl)
# print(item.get('href'))
classid = self.insertMongo(novelChapter, objectid)
self.pushRedis(classid, objectid, novelChapterUrl)
    def insertMongo(self, novelChapter, pid):
        # insert() was removed from recent PyMongo; insert_one() is the current API
        classid = collection.insert_one({'novelChapter': novelChapter, 'pid': pid}).inserted_id
        return classid
def pushRedis(self, classid,pid, novelChapterUrl):
novelChapterUrl = '%s,%s,%s' % ( classid , pid, novelChapterUrl)
r.lpush('novelChapterUrl', novelChapterUrl)
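Note that this spider stores the parent novel's _id in the chapter document's pid field as a plain string (it comes straight out of the Redis record), so queries that join chapters back to a novel have to compare against str(_id). A small check, assuming the same local MongoDB:

import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel

novel = db.novelname.find_one()                       # any novel crawled in step 4
if novel is not None:
    # chapters reference the novel through pid, stored as the string form of the ObjectId
    for chap in db.novelChapterInfo.find({'pid': str(novel['_id'])}).limit(10):
        print(chap['novelChapter'])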
6. Create qidianNovelWorksInfo.py to crawl each novel's basic information (author, status, categories) and update it into the existing novel-name collection of the MongoDB "novel" database.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId

# MongoDB connection: database "novel", collection "novelname" (filled by qidianNovel)
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelname

import redis  # Redis supplies the novel URLs queued by qidianNovel
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianNovelSpider(scrapy.Spider):
name = "qidianNovelWorksInfo"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    def __init__(self):
        # Read the novel URLs queued in the Redis list "novelnameurl" by qidianNovel
start_urls = []
urlList = r.lrange('novelnameurl', 0, -1)
ii = 0
self.dict = {}
for item in urlList:
itemStr = str(item, encoding="utf-8")
arr = itemStr.split(',')
classid = arr[0]
pid = arr[1]
url = arr[2]
start_urls.append(url)
self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
# ii += 1
# if ii > 5:
# break
print(start_urls)
self.start_urls = start_urls
def parse(self, response):
classInfo = self.dict[response.url]
objectid = classInfo['classid']
objectid2 = ObjectId(objectid)
pid = classInfo['pid']
# num = classInfo['num']
# if num > 3:
# return None
html = response.body.decode('utf-8')
selector = etree.HTML(html)
workName = selector.xpath('//div[@class="book-info "]/h1/span/a[@class="writer"]/text()')
novelName = selector.xpath('//div[@class="book-info "]/h1/em/text()')
novelState = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/span[@class="blue"]/text()')
novelClass = selector.xpath('//div[@class="book-info "]/p[@class="tag"]/a[@class="red"]/text()')
        objClass = novelClass[0]
        sonClass = novelClass[1]
        print("Novel name: " + novelName[0])
        print("Author: " + workName[0])
        print("Status: " + novelState[0])
        print("Category: " + objClass)
        print("Sub-category: " + sonClass)
        # store the first XPath match of each field; update() was removed from recent PyMongo, update_one() is the current API
        db.novelname.update_one({"_id": objectid2}, {"$set": {'workName': workName[0], 'novelName': novelName[0], 'novelState': novelState[0], 'objClass': objClass, 'sonClass': sonClass}})
print('--------end--------------')
# ---------------------------------------------------------------------------------------------------------------
# def updateMongo(self, workName,novelName,novelState,objClass,sonClass,objectid2):
# # classid = collection.update({'workName': workName,'novelName':novelName,'novelState':novelState,'objClass':objClass,'sonClass':sonClass,'pid': pid})
# classid = collection.update({"_id":objectid2 },{"$set":{'workName': workName, 'novelName': novelName, 'novelState': novelState, 'objClass': objClass, 'sonClass': sonClass}})
# return classid
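Once this spider has run, the documents in novelname carry the extra fields written by the $set update; a minimal check, assuming the same local MongoDB:

import pymongo

client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel

doc = db.novelname.find_one({'workName': {'$exists': True}})   # any novel already enriched by this spider
if doc is not None:
    print(doc['novelName'], doc['workName'], doc['novelState'], doc['objClass'], doc['sonClass'])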
7. Create qidianNovelChapterContent.py to crawl the chapter content and update it into the existing chapter collection of the MongoDB "novel" database.
# -*- coding: utf-8 -*-
import scrapy
from lxml import etree
import pymongo
from bson.objectid import ObjectId

# MongoDB connection: database "novel", collection "novelChapterInfo" (filled by qidianNovelChapterInfo)
client = pymongo.MongoClient(host="127.0.0.1")
db = client.novel
collection = db.novelChapterInfo

import redis  # Redis supplies the chapter URLs queued by qidianNovelChapterInfo
r = redis.Redis(host='127.0.0.1', port=6379, db=0)
class qidianNovelSpider(scrapy.Spider):
name = "qidianNovelChapterContent"
    allowed_domains = ["qidian.com"]  # domains the spider is allowed to crawl
    def __init__(self):
        # Read the chapter URLs queued in the Redis list "novelChapterUrl" by qidianNovelChapterInfo
start_urls = []
urlList = r.lrange('novelChapterUrl', 0,-1)
ii = 0
self.dict = {}
for item in urlList:
itemStr = str(item, encoding="utf-8")
arr = itemStr.split(',')
classid = arr[0]
pid = arr[1]
url = arr[2]
start_urls.append(url)
self.dict[url] = {"classid": classid, "pid": pid, "num": 0}
# ii += 1
# if ii > 10:
# break
# print(start_urls)
self.start_urls = start_urls
def parse(self, response):
classInfo = self.dict[response.url]
objectid = classInfo['classid']
objectid2 = ObjectId(objectid)
pid = classInfo['pid']
num = classInfo['num']
ii = ""
#==================================================================================
html = response.body.decode('utf-8')
selector = etree.HTML(html)
novelChaptersContents = selector.xpath('//div[@class ="read-content j_readContent"]/p')
# print(novelChaptersContent)
        for item in novelChaptersContents:
            novelChaptersContent = item.text
            if novelChaptersContent:
                ii = ii + novelChaptersContent   # append paragraphs in page order
        # update() was removed from recent PyMongo; update_one() is the current API
        db.novelChapterInfo.update_one({"_id": objectid2}, {"$set": {'novelChaptersContent': ii}})
print('------------------------------------------------------')
# ---------------------------------------------------------------------------------------------------------------
# def nextChapter(self, response):
# hxs = HtmlXPathSelector(response)
# nextChapter = hxs.select('//div[@"chapter-control dib-wrap"]/a[@id = "j_chapterNext"]')
# # print(nextPage.extract())
# if len(nextChapter) == 1:
# nextChapter = nextChapter.select('@href').extract()
# nextChapter= "https:" + nextChapter[0]
# print('==============' + nextChapter + '====================')
# return nextChapter
8. Run: from the project root, execute in the command prompt:
scrapy crawl <spider name>     (use the value of name = "..." defined in the corresponding .py file)
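Each spider reads the Redis queue filled by the previous one, so they need to be run in order, for example:

scrapy crawl qidianClass4               # step 3: categories   -> novelclass + "novelurl"
scrapy crawl qidianNovel                # step 4: novel list   -> novelname  + "novelnameurl"
scrapy crawl qidianNovelChapterInfo     # step 5: chapters     -> novelChapterInfo + "novelChapterUrl"
scrapy crawl qidianNovelWorksInfo       # step 6: novel info   -> updates novelname
scrapy crawl qidianNovelChapterContent  # step 7: chapter text -> updates novelChapterInfo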
I have been busy with ongoing projects recently and have not had time to tidy this up properly; apologies.