zoukankan html css js c++ java

小说下载脚本

================================

工具准备:

================================
下载与 chrome 浏览器版本一致的 chromedriver, chromedriver 国内下载镜像
https://npm.taobao.org/mirrors/chromedriver

将 chromedriver.exe 复制到 python 的 Scripts 目录中, 比如 C:\Anaconda3\Scripts
并将 C:\Anaconda3\Scripts 加到 Windows 环境变量 PATH 中.

================================
安装 selenium python 包

================================

pip install selenium

本文共有好多个下载脚本, 是一个不断完善的过程, 所以, 最后一个下载脚本是最通用, 最完美的.

================================
根据章节序号推算单章url地址, 然后下载

================================

from selenium import webdriver
from selenium.webdriver.common.by import By

# Download a novel whose chapter pages are addressed as <home_url><n>.html.
home_url = "https://www.bxwx123.com/novel/gvWcH/"
# Example chapter page: https://www.bxwx123.com/novel/gvWcH/1.html
chapter_start = 1
chapter_end = 32

# Pieces are collected in a list and joined once at the end, instead of
# growing one string with "+" on every chapter.
text_parts = ["小说:重生之庶女悠然", "\n\n\n"]

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        url = home_url + str(i) + ".html"
        # Separator and chapter heading written before each chapter body.
        text_parts.append("\n\n\n======================\n第" + str(i) + "章\n")
        web.get(url)
        # The chapter body is the element with class "showtxt" (id "content").
        content_tag = web.find_element(By.CLASS_NAME, "showtxt")
        text_parts.append(content_tag.text)
finally:
    # Always end the driver session, even when a page fails to load or parse.
    web.quit()

full_text = "".join(text_parts)
print(full_text)

from selenium import webdriver
from selenium.webdriver.common.by import By

# Download a novel whose chapter pages are addressed by a numeric page id:
# page id = chapter number + start_page_id.
home_url = "https://www.aixswx.com/xs/109/109734/"
# Example chapter page: https://www.aixswx.com/xs/109/109734/223913.html
chapter_start = 361
chapter_end = 566
start_page_id = 223912

# Pieces are collected in a list and joined once at the end, instead of
# growing one string with "+" on every chapter.
text_parts = ["小说:嫡女重生-谋嫁世子妃", "\n\n\n"]

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        page_id = i + start_page_id
        url = home_url + str(page_id) + ".html"
        # Separator and chapter heading written before each chapter body.
        text_parts.append("\n\n\n======================\n第" + str(i) + "章\n")
        web.get(url)
        # The chapter body is the element with class "subject_main"
        # (id "chapter-content").
        content_tag = web.find_element(By.CLASS_NAME, "subject_main")
        text_parts.append(content_tag.text)
finally:
    # Always end the driver session, even when a page fails to load or parse.
    web.quit()

full_text = "".join(text_parts)
print(full_text)

from selenium import webdriver
from selenium.webdriver.common.by import By

# Download a novel whose chapter pages are addressed by a numeric page id:
# page id = chapter number + start_page_id.
home_url = "https://www.bxwx123.com/novel/AEloS3/"
# Example chapter page: https://www.bxwx123.com/novel/AEloS3/1.html
chapter_start = 1
chapter_end = 330
start_page_id = 0

# Pieces are collected in a list and joined once at the end, instead of
# growing one string with "+" on every chapter.
text_parts = ["小说:嫡女媚", "\n\n\n"]

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        page_id = i + start_page_id
        url = home_url + str(page_id) + ".html"
        # Separator and chapter heading written before each chapter body.
        text_parts.append("\n\n\n======================\n第" + str(i) + "章\n")
        web.get(url)
        # The chapter body is the element with class "content".
        content_tag = web.find_element(By.CLASS_NAME, "content")
        text_parts.append(content_tag.text)
finally:
    # Always end the driver session, even when a page fails to load or parse.
    web.quit()

full_text = "".join(text_parts)
print(full_text)

from selenium import webdriver
from selenium.webdriver.common.by import By

# Download a novel whose chapter pages are addressed by a numeric page id:
# page id = chapter number + start_page_id.
home_url = "https://www.jingcaiyuedu6.com/novel/CW8MY3/"
# Example chapter page: https://www.jingcaiyuedu6.com/novel/CW8MY3/1.html
chapter_start = 1
chapter_end = 39
start_page_id = 0

# Pieces are collected in a list and joined once at the end, instead of
# growing one string with "+" on every chapter.
text_parts = ["小说:穿越种田之将门妻", "\n\n\n"]

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        page_id = i + start_page_id
        url = home_url + str(page_id) + ".html"
        # Separator and chapter heading written before each chapter body.
        text_parts.append("\n\n\n======================\n第" + str(i) + "章\n")
        web.get(url)
        # The chapter body is the element with id "content".
        content_tag = web.find_element(By.ID, "content")
        text_parts.append(content_tag.text)
finally:
    # Always end the driver session, even when a page fails to load or parse.
    web.quit()

full_text = "".join(text_parts)
print(full_text)

================================

从列表页提取单章url, 然后下载单章文本

================================

#========================================
# 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
#========================================
def num_to_char(num):
    """Convert a number to Chinese numerals one digit at a time.

    Every character of ``str(num)`` is looked up independently, so no place
    words are produced: 10 becomes "一零", not "十". A character that is not
    a decimal digit 0-9 raises KeyError.
    """
    digit_names = dict(zip("0123456789", u"零一二三四五六七八九"))
    return "".join(digit_names[ch] for ch in str(num))


#========================================
# 方法2: 数字转中文, 比较完美
#========================================
# -------------------------------------------------------------------------------
# Name:         num2chinese
# Author:       yunhgu
# Date:         2021/8/24 14:51
# Description:
# -------------------------------------------------------------------------------

# Chinese digit characters, indexed by digit value.
_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
# Place words inside one 4-digit group, indexed by power of ten.
_P0 = (u'', u'十', u'百', u'千',)
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
# Inclusive bounds accepted by Num2Chinese (_MAX == _S16 - 1).
_MIN, _MAX = 0, 9999999999999999


class NotIntegerError(Exception):
    """Raised when the value passed to Num2Chinese is not an int."""


class OutOfRangeError(Exception):
    """Raised when the int passed to Num2Chinese is outside [_MIN, _MAX]."""


class Num2Chinese:
    """Convert integers in [_MIN, _MAX] to Chinese numerals."""

    def convert(self, number: int):
        """
        Return the Chinese-numeral string for ``number``.

        :param number: int in the inclusive range [_MIN, _MAX]
        :return: chinese number, e.g. 110 -> "一百一十"
        :raises NotIntegerError: if ``number`` is not an int (bool is rejected)
        :raises OutOfRangeError: if ``number`` is negative or above _MAX
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch on its magnitude."""
        # Check the type first: comparing a non-int (for example a digit
        # string) against the bounds below would raise TypeError instead.
        if isinstance(num, bool) or not isinstance(num, int):
            raise NotIntegerError(u'%s is not a integer.' % (num,))
        if num < _MIN or num > _MAX:
            # Both bounds are inclusive, hence the closed bracket.
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        if num < _S8:
            return self._to_chinese8(num)
        return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num):
        """Convert one 4-digit group (0 <= num < 10**4)."""
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]

        # Decimal digits, least significant first, so the list index is the
        # power of ten and can index _P0 directly.
        digits = []
        while num:
            num, digit = divmod(num, 10)
            digits.append(digit)

        pieces = []
        zero_pending = False
        for power in range(len(digits) - 1, -1, -1):
            digit = digits[power]
            if digit == 0:
                # A run of zeros yields a single "零", and only when a
                # non-zero digit follows it.
                zero_pending = True
                continue
            if zero_pending:
                pieces.append(u'零')
                zero_pending = False
            pieces.append(_MAPPING[digit] + _P0[power])

        result = u''.join(pieces)
        # Only 10-19 drop the leading "一" (十, 十一, ...). Stripping "一十"
        # anywhere in the string would turn 110 into "一百十".
        if result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num):
        """Convert a number below 10**8 using the 万 (10**4) unit."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num)
        high, low = divmod(num, _S4)
        if low == 0:
            return to4(high) + u'万'
        # A low group without a thousands digit needs a "零" after 万.
        if low < _S4 // 10:
            return to4(high) + u'万零' + to4(low)
        return to4(high) + u'万' + to4(low)

    def _to_chinese16(self, num):
        """Convert a number below 10**16 using the 亿 (10**8) unit."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = divmod(num, _S8)
        if low == 0:
            return to8(high) + u'亿'
        # A low part without a ten-millions digit needs a "零" after 亿.
        if low < _S8 // 10:
            return to8(high) + u'亿零' + to8(low)
        return to8(high) + u'亿' + to8(low)

#========================================
# 从列表页提取单章url, 然后下载单章文本
#========================================
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Download a novel by looking up each chapter's link on the index page by its
# Chinese-numeral title, then fetching the linked chapter page.
list_url = "https://qilinchess.com/html/204/204846/"
chapter_start = 110
chapter_end = 326

num2chinese = Num2Chinese()
# Pieces are collected in a list and joined once at the end.
text_parts = ["小说:嫡女媚", "\n\n\n"]

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        chinese_chapter_name = "第" + num2chinese.convert(i) + "章"
        # Normalise "百十" to "百一十". replace() is a no-op when the text is
        # absent, so no guard is needed; str.find() returns -1 (truthy) on a
        # miss and must not be used as a condition.
        chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

        # Go back to the index page so the chapter link can be located.
        web.get(list_url)
        try:
            link = web.find_element(By.PARTIAL_LINK_TEXT, chinese_chapter_name)
            url = link.get_attribute("href")
        except NoSuchElementException:
            # No link with this chapter title on the index page.
            url = ""

        if url:
            web.get(url)
            # The chapter body is taken from the third "dd" element on the
            # page (the original notes it as the dd with id "contents").
            content_tag = web.find_elements(By.CSS_SELECTOR, "dd")[2]
            content = content_tag.text
        else:
            content = "不提供下载"

        chapter_text = "\n\n\n======================\n第" + str(i) + "章\n" + content
        print(chapter_text)
        text_parts.append(chapter_text)
finally:
    # Always end the driver session, even when a page fails to load or parse.
    web.quit()

full_text = "".join(text_parts)

#========================================
# 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
#========================================
def num_to_char(num):
    """Convert a number to Chinese numerals one digit at a time.

    Every character of ``str(num)`` is looked up independently, so no place
    words are produced: 10 becomes "一零", not "十". A character that is not
    a decimal digit 0-9 raises KeyError.
    """
    digit_names = dict(zip("0123456789", u"零一二三四五六七八九"))
    return "".join(digit_names[ch] for ch in str(num))


#========================================
# 方法2: 数字转中文, 比较完美
#========================================
# -------------------------------------------------------------------------------
# Name:         num2chinese
# Author:       yunhgu
# Date:         2021/8/24 14:51
# Description:
# -------------------------------------------------------------------------------

# Chinese digit characters, indexed by digit value.
_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
# Place words inside one 4-digit group, indexed by power of ten.
_P0 = (u'', u'十', u'百', u'千',)
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
# Inclusive bounds accepted by Num2Chinese (_MAX == _S16 - 1).
_MIN, _MAX = 0, 9999999999999999


class NotIntegerError(Exception):
    """Raised when the value passed to Num2Chinese is not an int."""


class OutOfRangeError(Exception):
    """Raised when the int passed to Num2Chinese is outside [_MIN, _MAX]."""


class Num2Chinese:
    """Convert integers in [_MIN, _MAX] to Chinese numerals."""

    def convert(self, number: int):
        """
        Return the Chinese-numeral string for ``number``.

        :param number: int in the inclusive range [_MIN, _MAX]
        :return: chinese number, e.g. 110 -> "一百一十"
        :raises NotIntegerError: if ``number`` is not an int (bool is rejected)
        :raises OutOfRangeError: if ``number`` is negative or above _MAX
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch on its magnitude."""
        # Check the type first: comparing a non-int (for example a digit
        # string) against the bounds below would raise TypeError instead.
        if isinstance(num, bool) or not isinstance(num, int):
            raise NotIntegerError(u'%s is not a integer.' % (num,))
        if num < _MIN or num > _MAX:
            # Both bounds are inclusive, hence the closed bracket.
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        if num < _S8:
            return self._to_chinese8(num)
        return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num):
        """Convert one 4-digit group (0 <= num < 10**4)."""
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]

        # Decimal digits, least significant first, so the list index is the
        # power of ten and can index _P0 directly.
        digits = []
        while num:
            num, digit = divmod(num, 10)
            digits.append(digit)

        pieces = []
        zero_pending = False
        for power in range(len(digits) - 1, -1, -1):
            digit = digits[power]
            if digit == 0:
                # A run of zeros yields a single "零", and only when a
                # non-zero digit follows it.
                zero_pending = True
                continue
            if zero_pending:
                pieces.append(u'零')
                zero_pending = False
            pieces.append(_MAPPING[digit] + _P0[power])

        result = u''.join(pieces)
        # Only 10-19 drop the leading "一" (十, 十一, ...). Stripping "一十"
        # anywhere in the string would turn 110 into "一百十".
        if result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num):
        """Convert a number below 10**8 using the 万 (10**4) unit."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num)
        high, low = divmod(num, _S4)
        if low == 0:
            return to4(high) + u'万'
        # A low group without a thousands digit needs a "零" after 万.
        if low < _S4 // 10:
            return to4(high) + u'万零' + to4(low)
        return to4(high) + u'万' + to4(low)

    def _to_chinese16(self, num):
        """Convert a number below 10**16 using the 亿 (10**8) unit."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = divmod(num, _S8)
        if low == 0:
            return to8(high) + u'亿'
        # A low part without a ten-millions digit needs a "零" after 亿.
        if low < _S8 // 10:
            return to8(high) + u'亿零' + to8(low)
        return to8(high) + u'亿' + to8(low)

#========================================
# 从列表页提取单章url, 然后下载单章文本
#========================================
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Download a novel by looking up each chapter's link on the index page by its
# chapter title, then fetching the linked chapter page.
list_url = "https://www.baihexs.com/0/54/"
chapter_start = 1
chapter_end = 306

num2chinese = Num2Chinese()
title_text = "小说:掌家小娘子" + "\n\n\n"
print(title_text)
# Pieces are collected in a list and joined once at the end.
text_parts = [title_text]

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        # Chapter titles are matched with Chinese numerals here; use str(i)
        # instead when the index page titles chapters with Arabic numerals.
        chinese_chapter_id = num2chinese.convert(i)
        chinese_chapter_name = "第" + chinese_chapter_id + "章"
        # Normalise "百十" to "百一十". replace() is a no-op when the text is
        # absent, so no guard is needed; str.find() returns -1 (truthy) on a
        # miss and must not be used as a condition.
        chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

        # Go back to the index page so the chapter link can be located.
        web.get(list_url)
        try:
            link = web.find_element(By.PARTIAL_LINK_TEXT, chinese_chapter_name)
            url = link.get_attribute("href")
        except NoSuchElementException:
            # No link with this chapter title on the index page.
            url = ""

        if url:
            web.get(url)
            # The chapter body is read from the element with id "center".
            content_tag = web.find_element(By.XPATH, '''//*[@id="center"]''')
            content = content_tag.text
        else:
            content = "不提供下载"

        chapter_text = "\n\n\n======================\n第" + str(i) + "章\n" + content
        print(chapter_text)
        text_parts.append(chapter_text)
finally:
    # Always end the driver session, even when a page fails to load or parse.
    web.quit()

full_text = "".join(text_parts)

================================
每章支持多个分页

================================

#========================================
# 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
#========================================
def num_to_char(num):
    """Convert a number to Chinese numerals one digit at a time.

    Every character of ``str(num)`` is looked up independently, so no place
    words are produced: 10 becomes "一零", not "十". A character that is not
    a decimal digit 0-9 raises KeyError.
    """
    digit_names = dict(zip("0123456789", u"零一二三四五六七八九"))
    return "".join(digit_names[ch] for ch in str(num))


#========================================
# 方法2: 数字转中文, 比较完美
#========================================
# -------------------------------------------------------------------------------
# Name:         num2chinese
# Author:       yunhgu
# Date:         2021/8/24 14:51
# Description:
# -------------------------------------------------------------------------------

# Chinese digit characters, indexed by digit value.
_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
# Place words inside one 4-digit group, indexed by power of ten.
_P0 = (u'', u'十', u'百', u'千',)
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
# Inclusive bounds accepted by Num2Chinese (_MAX == _S16 - 1).
_MIN, _MAX = 0, 9999999999999999


class NotIntegerError(Exception):
    """Raised when the value passed to Num2Chinese is not an int."""


class OutOfRangeError(Exception):
    """Raised when the int passed to Num2Chinese is outside [_MIN, _MAX]."""


class Num2Chinese:
    """Convert integers in [_MIN, _MAX] to Chinese numerals."""

    def convert(self, number: int):
        """
        Return the Chinese-numeral string for ``number``.

        :param number: int in the inclusive range [_MIN, _MAX]
        :return: chinese number, e.g. 110 -> "一百一十"
        :raises NotIntegerError: if ``number`` is not an int (bool is rejected)
        :raises OutOfRangeError: if ``number`` is negative or above _MAX
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch on its magnitude."""
        # Check the type first: comparing a non-int (for example a digit
        # string) against the bounds below would raise TypeError instead.
        if isinstance(num, bool) or not isinstance(num, int):
            raise NotIntegerError(u'%s is not a integer.' % (num,))
        if num < _MIN or num > _MAX:
            # Both bounds are inclusive, hence the closed bracket.
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        if num < _S8:
            return self._to_chinese8(num)
        return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num):
        """Convert one 4-digit group (0 <= num < 10**4)."""
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]

        # Decimal digits, least significant first, so the list index is the
        # power of ten and can index _P0 directly.
        digits = []
        while num:
            num, digit = divmod(num, 10)
            digits.append(digit)

        pieces = []
        zero_pending = False
        for power in range(len(digits) - 1, -1, -1):
            digit = digits[power]
            if digit == 0:
                # A run of zeros yields a single "零", and only when a
                # non-zero digit follows it.
                zero_pending = True
                continue
            if zero_pending:
                pieces.append(u'零')
                zero_pending = False
            pieces.append(_MAPPING[digit] + _P0[power])

        result = u''.join(pieces)
        # Only 10-19 drop the leading "一" (十, 十一, ...). Stripping "一十"
        # anywhere in the string would turn 110 into "一百十".
        if result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num):
        """Convert a number below 10**8 using the 万 (10**4) unit."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num)
        high, low = divmod(num, _S4)
        if low == 0:
            return to4(high) + u'万'
        # A low group without a thousands digit needs a "零" after 万.
        if low < _S4 // 10:
            return to4(high) + u'万零' + to4(low)
        return to4(high) + u'万' + to4(low)

    def _to_chinese16(self, num):
        """Convert a number below 10**16 using the 亿 (10**8) unit."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = divmod(num, _S8)
        if low == 0:
            return to8(high) + u'亿'
        # A low part without a ten-millions digit needs a "零" after 亿.
        if low < _S8 // 10:
            return to8(high) + u'亿零' + to8(low)
        return to8(high) + u'亿' + to8(low)


def get_sub_page_url(chapter_url, sub_page_count, first_sub_index, sub_page_id):
    """
    Return the URL of one sub-page of a chapter.

    Sub-page URLs are formed by inserting "_<sub_page_id>" before the ".html"
    suffix, e.g. ".../82000964.html" -> ".../82000964_1.html".

    :param chapter_url: chapter url
    :param sub_page_count: total sub_page count of every chapter; 0 means the
        chapter is not split and chapter_url is returned unchanged
    :param first_sub_index: first sub_page index with prefix _; smaller ids
        map to chapter_url itself
    :param sub_page_id: id of the requested sub-page
    :return: the sub-page url, or chapter_url when no suffix applies or the
        url contains no ".html"
    """
    if sub_page_count == 0 or sub_page_id < first_sub_index:
        return chapter_url
    # Rewrite only the last ".html": str.replace() would also alter an earlier
    # occurrence elsewhere in the URL (for example inside the host name).
    head, suffix, tail = chapter_url.rpartition(".html")
    if not suffix:
        return chapter_url
    return head + "_" + str(sub_page_id) + suffix + tail


#========================================
# 从列表页提取单章url, 然后下载单章文本
#========================================
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Download a novel whose chapters are split over several sub-pages: look up
# each chapter's link on the index page, then fetch every sub-page of it.
list_url = "https://www.mht99.com/98886/"

sub_page_count = 4    # number of sub-pages per chapter
first_sub_index = 1   # index used in the first "_N" sub-page url
chapter_start = 1
chapter_end = 480

num2chinese = Num2Chinese()
title_text = "小说:秀才家的冲喜娘子" + "\n\n\n"
print(title_text)
# Pieces are collected in a list and joined once at the end.
text_parts = [title_text]

web = webdriver.Chrome()
try:
    for i in range(chapter_start, chapter_end + 1):
        # Chapter titles are matched with Arabic numerals here; use
        # num2chinese.convert(i) instead when the index page titles chapters
        # with Chinese numerals.
        chinese_chapter_id = str(i)
        chinese_chapter_name = "第" + chinese_chapter_id + "章"
        # Normalise "百十" to "百一十". replace() is a no-op when the text is
        # absent, so no guard is needed; str.find() returns -1 (truthy) on a
        # miss and must not be used as a condition.
        chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

        # Go back to the index page so the chapter link can be located.
        web.get(list_url)
        try:
            link = web.find_element(By.PARTIAL_LINK_TEXT, chinese_chapter_name)
            chapter_url = link.get_attribute("href")
        except NoSuchElementException:
            # No link with this chapter title on the index page.
            chapter_url = ""

        if chapter_url:
            chapter_content = ""
            # NOTE(review): sub_page_id runs from 1 to sub_page_count, so with
            # first_sub_index = 1 the un-suffixed chapter url itself is never
            # requested - confirm this matches the site's paging scheme.
            for j in range(sub_page_count):
                sub_page_id = j + 1
                sub_page_url = get_sub_page_url(
                    chapter_url, sub_page_count, first_sub_index, sub_page_id)
                web.get(sub_page_url)
                try:
                    # The page text is read from the element with id "content".
                    content_tag = web.find_element(By.XPATH, '''//*[@id="content"]''')
                except NoSuchElementException:
                    print("####第" + str(sub_page_id) + "页:" + "下载失败")
                    continue
                chapter_content = chapter_content + "\n\n" + content_tag.text
        else:
            # Assign the variable that is actually written out below; the
            # original set an unused name here, leaving chapter_content
            # undefined or stale from the previous chapter.
            chapter_content = "不提供下载"

        page_title_text = "\n\n\n======================\n第" + str(i) + "章\n"
        chapter_full_text = page_title_text + chapter_content
        print(chapter_full_text)
        text_parts.append(chapter_full_text)
finally:
    # Always end the driver session, even when a page fails to load or parse.
    web.quit()

full_text = "".join(text_parts)

================================
selenium 的更多信息

================================

selenium 不仅支持Python, 还支持Java/C#

https://www.selenium.dev/documentation/zh-cn/webdriver/driver_requirements/
https://www.selenium.dev/documentation/zh-cn/selenium_installation/installing_webdriver_binaries/

查看全文

相关阅读:
synchronized锁升级的过程（偏向锁到轻量锁再到重量级锁）转
 sprin 事务注解@Transactional的实现原理（转）
springboot + redis + 注解 + 拦截器实现接口幂等性校验(转)
JAVA-TCP
oracle查看连接信息
 C# 计算两个字符的相似度
 Java设计模式桥接模式
 C# Newtonsoft.Json.JsonReaderException:“Could not convert string to decimal:
java设计模式结构型模式
 Java设计模式原型模式

原文地址：https://www.cnblogs.com/harrychinese/p/novel_download.html