zoukankan      html  css  js  c++  java
  • 小说下载脚本

    ================================

    工具准备:

    ================================
    下载与 Chrome 浏览器版本一致的 chromedriver, chromedriver 国内下载镜像:
    https://npm.taobao.org/mirrors/chromedriver

    将 chromedriver.exe 复制到 python 的 Scripts 目录中, 比如 C:\Anaconda3\Scripts
    并将 C:\Anaconda3\Scripts 加到 Windows 环境变量 PATH 中.


    ================================
    安装 selenium python 包

    ================================

    pip install selenium

    本文共有好多个下载脚本, 是一个不断完善的过程,  所以, 最后一个下载脚本是最通用, 最完美的.

    ================================
    根据章节序号推算单章url地址, 然后下载

    ================================

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    # Download a novel whose chapter pages are numbered consecutively:
    # chapter i lives at home_url + "<i>.html".
    home_url = "https://www.bxwx123.com/novel/gvWcH/"
    chapter_start = 1
    chapter_end = 32

    # Pieces are collected in a list and joined once at the end instead of
    # growing one string inside the loop.
    parts = ["小说:重生之庶女悠然", "\n" + "\n" + "\n"]

    web = webdriver.Chrome()
    try:
        for i in range(chapter_start, chapter_end + 1):
            url = home_url + str(i) + ".html"
            # Separator and chapter heading written before each chapter body.
            parts.append("\n" + "\n" + "\n" + "======================" + "\n"
                         + "第" + str(i) + "章" + "\n")
            web.get(url)
            # The chapter text is taken from the element with class "showtxt".
            content_tag = web.find_element(By.CLASS_NAME, "showtxt")
            parts.append(content_tag.text)
    finally:
        # quit() also ends the chromedriver process, even if a page failed.
        web.quit()

    full_text = "".join(parts)
    print(full_text)
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    # Download a novel whose page ids are consecutive but offset from the
    # chapter number: chapter i lives at home_url + "<i + start_page_id>.html".
    home_url = "https://www.aixswx.com/xs/109/109734/"
    chapter_start = 361
    chapter_end = 566
    start_page_id = 223912

    # Pieces are collected in a list and joined once at the end.
    parts = ["小说:嫡女重生-谋嫁世子妃", "\n" + "\n" + "\n"]

    web = webdriver.Chrome()
    try:
        for i in range(chapter_start, chapter_end + 1):
            page_id = i + start_page_id
            url = home_url + str(page_id) + ".html"
            # Separator and chapter heading written before each chapter body.
            parts.append("\n" + "\n" + "\n" + "======================" + "\n"
                         + "第" + str(i) + "章" + "\n")
            web.get(url)
            # The chapter text is taken from the element with class "subject_main".
            content_tag = web.find_element(By.CLASS_NAME, "subject_main")
            parts.append(content_tag.text)
    finally:
        # quit() also ends the chromedriver process, even if a page failed.
        web.quit()

    full_text = "".join(parts)
    print(full_text)
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    # Download a novel by computing each chapter URL from its number:
    # chapter i lives at home_url + "<i + start_page_id>.html".
    home_url = "https://www.bxwx123.com/novel/AEloS3/"
    chapter_start = 1
    chapter_end = 330
    start_page_id = 0

    # Pieces are collected in a list and joined once at the end.
    parts = ["小说:嫡女媚", "\n" + "\n" + "\n"]

    web = webdriver.Chrome()
    try:
        for i in range(chapter_start, chapter_end + 1):
            page_id = i + start_page_id
            url = home_url + str(page_id) + ".html"
            # Separator and chapter heading written before each chapter body.
            parts.append("\n" + "\n" + "\n" + "======================" + "\n"
                         + "第" + str(i) + "章" + "\n")
            web.get(url)
            # The chapter text is taken from the element with class "content".
            content_tag = web.find_element(By.CLASS_NAME, "content")
            parts.append(content_tag.text)
    finally:
        # quit() also ends the chromedriver process, even if a page failed.
        web.quit()

    full_text = "".join(parts)
    print(full_text)
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    # Download a novel by computing each chapter URL from its number:
    # chapter i lives at home_url + "<i + start_page_id>.html".
    home_url = "https://www.jingcaiyuedu6.com/novel/CW8MY3/"
    chapter_start = 1
    chapter_end = 39
    start_page_id = 0

    # Pieces are collected in a list and joined once at the end.
    parts = ["小说:穿越种田之将门妻", "\n" + "\n" + "\n"]

    web = webdriver.Chrome()
    try:
        for i in range(chapter_start, chapter_end + 1):
            page_id = i + start_page_id
            url = home_url + str(page_id) + ".html"
            # Separator and chapter heading written before each chapter body.
            parts.append("\n" + "\n" + "\n" + "======================" + "\n"
                         + "第" + str(i) + "章" + "\n")
            web.get(url)
            # The chapter text is taken from the element whose id is "content".
            content_tag = web.find_element(By.ID, "content")
            parts.append(content_tag.text)
    finally:
        # quit() also ends the chromedriver process, even if a page failed.
        web.quit()

    full_text = "".join(parts)
    print(full_text)

    ================================

    从列表页提取单章url, 然后下载单章文本

    ================================

    #========================================
    # 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
    #========================================
    def num_to_char(num):
        """Translate every decimal digit of *num* into its Chinese numeral glyph.

        The conversion is purely positional: no place-value words are
        inserted, so 10 becomes "一零" rather than "十".

        :param num: a value whose ``str()`` form consists of the digits 0-9
        :return: a string with one Chinese glyph per input digit
        :raises KeyError: if ``str(num)`` contains a non-digit character
        """
        digit_glyphs = {
            "0": u"零", "1": u"一", "2": u"二", "3": u"三", "4": u"四",
            "5": u"五", "6": u"六", "7": u"七", "8": u"八", "9": u"九",
        }
        return "".join(digit_glyphs[digit] for digit in str(num))
    
    
    #========================================
    # 方法2: 数字转中文, 比较完美
    #========================================
    # -------------------------------------------------------------------------------
    # Name:         num2chinese
    # Author:       yunhgu
    # Date:         2021/8/24 14:51
    # Description:
    # -------------------------------------------------------------------------------
    
    # Digit glyphs indexed by their numeric value (0-9).
    _MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
    # Place-value suffixes inside one four-digit group: ones, tens, hundreds, thousands.
    _P0 = (u'', u'十', u'百', u'千',)
    _S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
    _MIN, _MAX = 0, 9999999999999999


    class NotIntegerError(Exception):
        """Raised when the value to convert is not a non-negative integer."""


    class OutOfRangeError(Exception):
        """Raised when the value to convert lies outside [_MIN, _MAX]."""


    class Num2Chinese:
        """Convert integers in [_MIN, _MAX] to Chinese numerals."""

        def convert(self, number: int):
            """
            Convert *number* to its Chinese numeral spelling.

            :param number: integer in the range [_MIN, _MAX]
            :return: chinese number
            :raises NotIntegerError: if *number* is not a non-negative integer
            :raises OutOfRangeError: if *number* is greater than _MAX
            """
            return self._to_chinese(number)

        def _to_chinese(self, num):
            """Validate *num* and dispatch on its magnitude."""
            # The isinstance test rejects digit-only strings, which would
            # otherwise fail later with a TypeError on the range comparison;
            # the isdigit test rejects negative numbers and booleans.
            if not isinstance(num, int) or not str(num).isdigit():
                raise NotIntegerError(u'%s is not a integer.' % (num,))
            if num < _MIN or num > _MAX:
                raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
            if num < _S4:
                return self._to_chinese4(num)
            elif num < _S8:
                return self._to_chinese8(num)
            else:
                return self._to_chinese16(num)

        @staticmethod
        def _as_trailing_group(text):
            """Spell a group that is not at the start of the number.

            A leading group of 10-19 drops its "一" ("十五"), but the same
            group following 万 or 亿 keeps it ("一万零一十五").
            """
            return u'一' + text if text.startswith(u'十') else text

        @staticmethod
        def _to_chinese4(num):
            """Convert an integer in [0, 10000)."""
            assert (0 <= num < _S4)
            if num < 10:
                return _MAPPING[num]
            # Digits are collected least-significant first.
            lst = []
            while num >= 10:
                num, digit = divmod(num, 10)
                lst.append(digit)
            lst.append(num)
            c = len(lst)  # number of digits
            # The text is assembled back to front and reversed at the end;
            # every piece is a single character, so reversing is safe.
            result = u''
            for idx, val in enumerate(lst):
                if val != 0:
                    result += _P0[idx] + _MAPPING[val]
                    # One 零 stands in for a run of zeros above this digit.
                    if idx < c - 1 and lst[idx + 1] == 0:
                        result += u'零'
            text = result[::-1]
            # Only 10-19 are shortened ("一十五" -> "十五"); an inner "一十",
            # as in 110 or 1010, must be kept.
            if text.startswith(u'一十'):
                text = text[1:]
            return text

        def _to_chinese8(self, num):
            """Convert an integer in [0, 10 ** 8)."""
            assert (num < _S8)
            to4 = self._to_chinese4
            if num < _S4:
                return to4(num)
            high, low = divmod(num, _S4)
            if low == 0:
                return to4(high) + u'万'
            low_text = self._as_trailing_group(to4(low))
            # A low group without a thousands digit needs a 零 after 万.
            if low < _S4 // 10:
                return to4(high) + u'万零' + low_text
            return to4(high) + u'万' + low_text

        def _to_chinese16(self, num):
            """Convert an integer in [0, 10 ** 16)."""
            assert (num < _S16)
            to8 = self._to_chinese8
            high, low = divmod(num, _S8)
            if low == 0:
                return to8(high) + u'亿'
            low_text = self._as_trailing_group(to8(low))
            # A low group without a ten-millions digit needs a 零 after 亿.
            if low < _S8 // 10:
                return to8(high) + u'亿零' + low_text
            return to8(high) + u'亿' + low_text
    
    #========================================
    # 从列表页提取单章url, 然后下载单章文本
    #========================================
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    # Download a novel by looking up each chapter link on the index page
    # (matched by its Chinese-numeral chapter title) and then fetching it.
    list_url = "https://qilinchess.com/html/204/204846/"
    chapter_start = 110
    chapter_end = 326

    num2chinese = Num2Chinese()
    parts = ["小说:嫡女媚", "\n" + "\n" + "\n"]

    web = webdriver.Chrome()
    try:
        for i in range(chapter_start, chapter_end + 1):
            chinese_chapter_name = "第" + num2chinese.convert(i) + "章"
            # The site spells 110-119 as "一百一十…"; normalise the shorter
            # "百十" spelling if the converter produced it.
            chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

            # Go back to the index page to look up this chapter's URL.
            web.get(list_url)
            try:
                url = web.find_element(
                    By.PARTIAL_LINK_TEXT, chinese_chapter_name
                ).get_attribute("href")
            except NoSuchElementException:
                # No link carries this chapter title.
                url = ""

            if url:
                web.get(url)
                # The chapter text is read from the third "dd" element on the page.
                content_tag = web.find_elements(By.CSS_SELECTOR, "dd")[2]
                content = content_tag.text
            else:
                content = "不提供下载"

            chapter_text = ("\n" + "\n" + "\n" + "======================" + "\n"
                            + "第" + str(i) + "章" + "\n")
            chapter_text = chapter_text + content
            print(chapter_text)
            parts.append(chapter_text)
    finally:
        # quit() also ends the chromedriver process, even if a page failed.
        web.quit()

    full_text = "".join(parts)
    #========================================
    # 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
    #========================================
    def num_to_char(num):
        """Translate every decimal digit of *num* into its Chinese numeral glyph.

        The conversion is purely positional: no place-value words are
        inserted, so 10 becomes "一零" rather than "十".

        :param num: a value whose ``str()`` form consists of the digits 0-9
        :return: a string with one Chinese glyph per input digit
        :raises KeyError: if ``str(num)`` contains a non-digit character
        """
        digit_glyphs = {
            "0": u"零", "1": u"一", "2": u"二", "3": u"三", "4": u"四",
            "5": u"五", "6": u"六", "7": u"七", "8": u"八", "9": u"九",
        }
        return "".join(digit_glyphs[digit] for digit in str(num))
    
    
    #========================================
    # 方法2: 数字转中文, 比较完美
    #========================================
    # -------------------------------------------------------------------------------
    # Name:         num2chinese
    # Author:       yunhgu
    # Date:         2021/8/24 14:51
    # Description:
    # -------------------------------------------------------------------------------
    
    # Digit glyphs indexed by their numeric value (0-9).
    _MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
    # Place-value suffixes inside one four-digit group: ones, tens, hundreds, thousands.
    _P0 = (u'', u'十', u'百', u'千',)
    _S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
    _MIN, _MAX = 0, 9999999999999999


    class NotIntegerError(Exception):
        """Raised when the value to convert is not a non-negative integer."""


    class OutOfRangeError(Exception):
        """Raised when the value to convert lies outside [_MIN, _MAX]."""


    class Num2Chinese:
        """Convert integers in [_MIN, _MAX] to Chinese numerals."""

        def convert(self, number: int):
            """
            Convert *number* to its Chinese numeral spelling.

            :param number: integer in the range [_MIN, _MAX]
            :return: chinese number
            :raises NotIntegerError: if *number* is not a non-negative integer
            :raises OutOfRangeError: if *number* is greater than _MAX
            """
            return self._to_chinese(number)

        def _to_chinese(self, num):
            """Validate *num* and dispatch on its magnitude."""
            # The isinstance test rejects digit-only strings, which would
            # otherwise fail later with a TypeError on the range comparison;
            # the isdigit test rejects negative numbers and booleans.
            if not isinstance(num, int) or not str(num).isdigit():
                raise NotIntegerError(u'%s is not a integer.' % (num,))
            if num < _MIN or num > _MAX:
                raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
            if num < _S4:
                return self._to_chinese4(num)
            elif num < _S8:
                return self._to_chinese8(num)
            else:
                return self._to_chinese16(num)

        @staticmethod
        def _as_trailing_group(text):
            """Spell a group that is not at the start of the number.

            A leading group of 10-19 drops its "一" ("十五"), but the same
            group following 万 or 亿 keeps it ("一万零一十五").
            """
            return u'一' + text if text.startswith(u'十') else text

        @staticmethod
        def _to_chinese4(num):
            """Convert an integer in [0, 10000)."""
            assert (0 <= num < _S4)
            if num < 10:
                return _MAPPING[num]
            # Digits are collected least-significant first.
            lst = []
            while num >= 10:
                num, digit = divmod(num, 10)
                lst.append(digit)
            lst.append(num)
            c = len(lst)  # number of digits
            # The text is assembled back to front and reversed at the end;
            # every piece is a single character, so reversing is safe.
            result = u''
            for idx, val in enumerate(lst):
                if val != 0:
                    result += _P0[idx] + _MAPPING[val]
                    # One 零 stands in for a run of zeros above this digit.
                    if idx < c - 1 and lst[idx + 1] == 0:
                        result += u'零'
            text = result[::-1]
            # Only 10-19 are shortened ("一十五" -> "十五"); an inner "一十",
            # as in 110 or 1010, must be kept.
            if text.startswith(u'一十'):
                text = text[1:]
            return text

        def _to_chinese8(self, num):
            """Convert an integer in [0, 10 ** 8)."""
            assert (num < _S8)
            to4 = self._to_chinese4
            if num < _S4:
                return to4(num)
            high, low = divmod(num, _S4)
            if low == 0:
                return to4(high) + u'万'
            low_text = self._as_trailing_group(to4(low))
            # A low group without a thousands digit needs a 零 after 万.
            if low < _S4 // 10:
                return to4(high) + u'万零' + low_text
            return to4(high) + u'万' + low_text

        def _to_chinese16(self, num):
            """Convert an integer in [0, 10 ** 16)."""
            assert (num < _S16)
            to8 = self._to_chinese8
            high, low = divmod(num, _S8)
            if low == 0:
                return to8(high) + u'亿'
            low_text = self._as_trailing_group(to8(low))
            # A low group without a ten-millions digit needs a 零 after 亿.
            if low < _S8 // 10:
                return to8(high) + u'亿零' + low_text
            return to8(high) + u'亿' + low_text
    
    #========================================
    # 从列表页提取单章url, 然后下载单章文本
    #========================================
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    # Download a novel by looking up each chapter link on the index page
    # (matched by its chapter title) and then fetching the chapter page.
    list_url = "https://www.baihexs.com/0/54/"
    chapter_start = 1
    chapter_end = 306

    num2chinese = Num2Chinese()

    header = "小说:掌家小娘子" + "\n" + "\n" + "\n"
    print(header)
    parts = [header]

    web = webdriver.Chrome()
    try:
        for i in range(chapter_start, chapter_end + 1):
            # Chinese numerals; use str(i) instead for sites that title
            # their chapters with Arabic numerals.
            chinese_chapter_id = num2chinese.convert(i)
            chinese_chapter_name = "第" + chinese_chapter_id + "章"
            # The site spells 110-119 as "一百一十…"; normalise the shorter
            # "百十" spelling if the converter produced it.
            chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

            # Go back to the index page to look up this chapter's URL.
            web.get(list_url)
            try:
                url = web.find_element(
                    By.PARTIAL_LINK_TEXT, chinese_chapter_name
                ).get_attribute("href")
            except NoSuchElementException:
                # No link carries this chapter title.
                url = ""

            if url:
                web.get(url)
                # The chapter text is read from the element whose id is "center".
                content_tag = web.find_element(By.XPATH, '//*[@id="center"]')
                content = content_tag.text
            else:
                content = "不提供下载"

            chapter_text = ("\n" + "\n" + "\n" + "======================" + "\n"
                            + "第" + str(i) + "章" + "\n")
            chapter_text = chapter_text + content
            print(chapter_text)
            parts.append(chapter_text)
    finally:
        # quit() also ends the chromedriver process, even if a page failed.
        web.quit()

    full_text = "".join(parts)

    ================================
    每章支持多个分页

    ================================

    #========================================
    # 方法1: 数字转中文, 有缺陷,比如: 10将转成一零
    #========================================
    def num_to_char(num):
        """Translate every decimal digit of *num* into its Chinese numeral glyph.

        The conversion is purely positional: no place-value words are
        inserted, so 10 becomes "一零" rather than "十".

        :param num: a value whose ``str()`` form consists of the digits 0-9
        :return: a string with one Chinese glyph per input digit
        :raises KeyError: if ``str(num)`` contains a non-digit character
        """
        digit_glyphs = {
            "0": u"零", "1": u"一", "2": u"二", "3": u"三", "4": u"四",
            "5": u"五", "6": u"六", "7": u"七", "8": u"八", "9": u"九",
        }
        return "".join(digit_glyphs[digit] for digit in str(num))
    
    
    #========================================
    # 方法2: 数字转中文, 比较完美
    #========================================
    # -------------------------------------------------------------------------------
    # Name:         num2chinese
    # Author:       yunhgu
    # Date:         2021/8/24 14:51
    # Description:
    # -------------------------------------------------------------------------------
    
    # Digit glyphs indexed by their numeric value (0-9).
    _MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
    # Place-value suffixes inside one four-digit group: ones, tens, hundreds, thousands.
    _P0 = (u'', u'十', u'百', u'千',)
    _S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
    _MIN, _MAX = 0, 9999999999999999


    class NotIntegerError(Exception):
        """Raised when the value to convert is not a non-negative integer."""


    class OutOfRangeError(Exception):
        """Raised when the value to convert lies outside [_MIN, _MAX]."""


    class Num2Chinese:
        """Convert integers in [_MIN, _MAX] to Chinese numerals."""

        def convert(self, number: int):
            """
            Convert *number* to its Chinese numeral spelling.

            :param number: integer in the range [_MIN, _MAX]
            :return: chinese number
            :raises NotIntegerError: if *number* is not a non-negative integer
            :raises OutOfRangeError: if *number* is greater than _MAX
            """
            return self._to_chinese(number)

        def _to_chinese(self, num):
            """Validate *num* and dispatch on its magnitude."""
            # The isinstance test rejects digit-only strings, which would
            # otherwise fail later with a TypeError on the range comparison;
            # the isdigit test rejects negative numbers and booleans.
            if not isinstance(num, int) or not str(num).isdigit():
                raise NotIntegerError(u'%s is not a integer.' % (num,))
            if num < _MIN or num > _MAX:
                raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
            if num < _S4:
                return self._to_chinese4(num)
            elif num < _S8:
                return self._to_chinese8(num)
            else:
                return self._to_chinese16(num)

        @staticmethod
        def _as_trailing_group(text):
            """Spell a group that is not at the start of the number.

            A leading group of 10-19 drops its "一" ("十五"), but the same
            group following 万 or 亿 keeps it ("一万零一十五").
            """
            return u'一' + text if text.startswith(u'十') else text

        @staticmethod
        def _to_chinese4(num):
            """Convert an integer in [0, 10000)."""
            assert (0 <= num < _S4)
            if num < 10:
                return _MAPPING[num]
            # Digits are collected least-significant first.
            lst = []
            while num >= 10:
                num, digit = divmod(num, 10)
                lst.append(digit)
            lst.append(num)
            c = len(lst)  # number of digits
            # The text is assembled back to front and reversed at the end;
            # every piece is a single character, so reversing is safe.
            result = u''
            for idx, val in enumerate(lst):
                if val != 0:
                    result += _P0[idx] + _MAPPING[val]
                    # One 零 stands in for a run of zeros above this digit.
                    if idx < c - 1 and lst[idx + 1] == 0:
                        result += u'零'
            text = result[::-1]
            # Only 10-19 are shortened ("一十五" -> "十五"); an inner "一十",
            # as in 110 or 1010, must be kept.
            if text.startswith(u'一十'):
                text = text[1:]
            return text

        def _to_chinese8(self, num):
            """Convert an integer in [0, 10 ** 8)."""
            assert (num < _S8)
            to4 = self._to_chinese4
            if num < _S4:
                return to4(num)
            high, low = divmod(num, _S4)
            if low == 0:
                return to4(high) + u'万'
            low_text = self._as_trailing_group(to4(low))
            # A low group without a thousands digit needs a 零 after 万.
            if low < _S4 // 10:
                return to4(high) + u'万零' + low_text
            return to4(high) + u'万' + low_text

        def _to_chinese16(self, num):
            """Convert an integer in [0, 10 ** 16)."""
            assert (num < _S16)
            to8 = self._to_chinese8
            high, low = divmod(num, _S8)
            if low == 0:
                return to8(high) + u'亿'
            low_text = self._as_trailing_group(to8(low))
            # A low group without a ten-millions digit needs a 零 after 亿.
            if low < _S8 // 10:
                return to8(high) + u'亿零' + low_text
            return to8(high) + u'亿' + low_text
    
    
    def get_sub_page_url(chapter_url, sub_page_count, first_sub_index, sub_page_id):
        """
        Build the URL of one sub-page of a chapter.

        Sub-pages are addressed by inserting "_<sub_page_id>" in front of the
        ".html" extension, e.g. ".../82000964.html" -> ".../82000964_1.html".

        :param chapter_url: chapter url (the un-suffixed page)
        :param sub_page_count: total sub_page count of every chapter; 0 means
            the chapter is not paginated
        :param first_sub_index: first sub_page index that carries the "_" suffix
        :param sub_page_id: index of the requested sub_page
        :return: chapter_url itself when the chapter is not paginated, when
            sub_page_id is below first_sub_index, or when the url contains no
            ".html"; otherwise the suffixed url
        """
        if sub_page_count == 0 or sub_page_id < first_sub_index:
            return chapter_url
        # Rewrite only the last ".html" so that an earlier ".html" in the host
        # or directory part of the url is left untouched.
        head, extension, tail = chapter_url.rpartition(".html")
        if not extension:
            return chapter_url
        return head + "_" + str(sub_page_id) + extension + tail
    
    
    #========================================
    # 从列表页提取单章url, 然后下载单章文本
    #========================================
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    # Download a novel whose chapters are each split across several sub-pages.
    # The chapter URL is looked up on the index page by chapter title, then
    # every sub-page URL is derived from it with get_sub_page_url().
    list_url = "https://www.mht99.com/98886/"

    sub_page_count = 4    # number of sub-pages fetched per chapter
    first_sub_index = 1   # first sub-page index that appears as "_<n>" in the url
    chapter_start = 1
    chapter_end = 480

    num2chinese = Num2Chinese()

    header = "小说:秀才家的冲喜娘子" + "\n" + "\n" + "\n"
    print(header)
    parts = [header]

    web = webdriver.Chrome()
    try:
        for i in range(chapter_start, chapter_end + 1):
            # Arabic numerals; use num2chinese.convert(i) instead for sites
            # that title their chapters with Chinese numerals.
            chinese_chapter_id = str(i)
            chinese_chapter_name = "第" + chinese_chapter_id + "章"
            # Only matters for Chinese numerals: normalise "百十" to "百一十".
            chinese_chapter_name = chinese_chapter_name.replace("百十", "百一十")

            # Go back to the index page to look up this chapter's URL.
            web.get(list_url)
            try:
                chapter_url = web.find_element(
                    By.PARTIAL_LINK_TEXT, chinese_chapter_name
                ).get_attribute("href")
            except NoSuchElementException:
                # No link carries this chapter title.
                chapter_url = ""

            if chapter_url:
                chapter_content = ""
                # NOTE(review): ids run 1..sub_page_count, so the un-suffixed
                # chapter page itself is never fetched - confirm that this
                # matches the site's pagination scheme.
                for j in range(sub_page_count):
                    sub_page_id = j + 1
                    sub_page_url = get_sub_page_url(
                        chapter_url, sub_page_count, first_sub_index, sub_page_id
                    )
                    try:
                        web.get(sub_page_url)
                        # The text is read from the element whose id is "content".
                        content_tag = web.find_element(By.XPATH, '//*[@id="content"]')
                    except NoSuchElementException:
                        print("####第" + str(sub_page_id) + "页:" + "下载失败")
                    else:
                        chapter_content = chapter_content + "\n" + "\n"
                        chapter_content = chapter_content + content_tag.text
            else:
                # Assigned to the variable that is written out below, so a
                # missing chapter neither raises NameError nor repeats the
                # previous chapter's text.
                chapter_content = "不提供下载"

            page_title_text = ("\n" + "\n" + "\n" + "======================" + "\n"
                               + "第" + str(i) + "章" + "\n")
            chapter_full_text = page_title_text + chapter_content
            print(chapter_full_text)
            parts.append(chapter_full_text)
    finally:
        # quit() also ends the chromedriver process, even if a page failed.
        web.quit()

    full_text = "".join(parts)

    ================================
    selenium  的更多信息

    ================================

    selenium 不仅支持Python, 还支持Java/C#

    https://www.selenium.dev/documentation/zh-cn/webdriver/driver_requirements/
    https://www.selenium.dev/documentation/zh-cn/selenium_installation/installing_webdriver_binaries/

  • 相关阅读:
    需求变更的种类及应对方式
    SQL Server中连接远程表、查询其它服务器的数据、导入或导出到其它Sql Server服务器数据
    在IE9中MSWC.BrowserType组件无法识别Cookie的问题
    优秀软件的几个重要标准
    对待代码的态度反应着对待自己的态度
    应对企业不断变化的系统
    在SQL中插入®特殊字符
    如何让领导认识到测试的重要性,在沟通时要注意的几点
    男人要补肾,强肾健脑对能持久做程序
    你可能不知道的Visual Studio 2010使用技巧(VS2010的秘密)
  • 原文地址:https://www.cnblogs.com/harrychinese/p/novel_download.html
Copyright © 2011-2022 走看看