================================
工具准备:
================================
下载与 Chrome 浏览器版本一致的 chromedriver, chromedriver 国内下载镜像:
https://npm.taobao.org/mirrors/chromedriver
将 chromedriver.exe 复制到 Python 的 Scripts 目录中, 比如 C:\Anaconda3\Scripts,
并将 C:\Anaconda3\Scripts 加到 Windows 环境变量 PATH 中.
================================
安装 selenium python 包
================================
pip install selenium
本文共有好多个下载脚本, 是一个不断完善的过程, 所以, 最后一个下载脚本是最通用, 最完美的.
================================
根据章节序号推算单章url地址, 然后下载
================================
# ========================================
# Download a novel whose chapter pages are numbered 1.html, 2.html, ...
# ========================================
NOVEL_TITLE = "小说:重生之庶女悠然"
HOME_URL = "https://www.bxwx123.com/novel/gvWcH/"
CHAPTER_START = 1
CHAPTER_END = 32
# NOTE(review): the separators are plain spaces in this copy of the script;
# they may originally have been newline escapes -- confirm before changing.
TITLE_SEPARATOR = " " + " " + " "


def chapter_url(chapter_no):
    """Return the page URL of a chapter, e.g. .../novel/gvWcH/1.html."""
    return HOME_URL + str(chapter_no) + ".html"


def chapter_header(chapter_no):
    """Return the separator text written in front of a chapter body."""
    return TITLE_SEPARATOR + "======================" + " " + "第" + str(chapter_no) + "章" + " "


def main():
    """Fetch chapters CHAPTER_START..CHAPTER_END and print the whole text.

    :return: the title followed by each chapter header and chapter body
    """
    # Imported here so the pure helpers above stay importable on a machine
    # that has neither Selenium nor a browser.
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    parts = [NOVEL_TITLE, TITLE_SEPARATOR]
    web = webdriver.Chrome()
    try:
        for i in range(CHAPTER_START, CHAPTER_END + 1):
            web.get(chapter_url(i))
            # The chapter body is <div id="content" class="showtxt">.
            # find_element(By...) replaces find_element_by_class_name, which
            # was removed in Selenium 4.3.
            content = web.find_element(By.CLASS_NAME, "showtxt").text
            parts.append(chapter_header(i))
            parts.append(content)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and runs even when a page fails to load.
        web.quit()

    # Join once instead of growing one string with += for every chapter.
    full_text = "".join(parts)
    print(full_text)
    return full_text


if __name__ == "__main__":
    main()
# ========================================
# Download a novel whose page ids are offset from the chapter numbers.
# ========================================
NOVEL_TITLE = "小说:嫡女重生-谋嫁世子妃"
HOME_URL = "https://www.aixswx.com/xs/109/109734/"
CHAPTER_START = 361
CHAPTER_END = 566
# Added to the chapter number to obtain the numeric page id in the URL.
START_PAGE_ID = 223912
# NOTE(review): the separators are plain spaces in this copy of the script;
# they may originally have been newline escapes -- confirm before changing.
TITLE_SEPARATOR = " " + " " + " "


def chapter_url(chapter_no):
    """Return the page URL of a chapter (page id = chapter + START_PAGE_ID)."""
    return HOME_URL + str(chapter_no + START_PAGE_ID) + ".html"


def chapter_header(chapter_no):
    """Return the separator text written in front of a chapter body."""
    return TITLE_SEPARATOR + "======================" + " " + "第" + str(chapter_no) + "章" + " "


def main():
    """Fetch chapters CHAPTER_START..CHAPTER_END and print the whole text.

    :return: the title followed by each chapter header and chapter body
    """
    # Imported here so the pure helpers above stay importable on a machine
    # that has neither Selenium nor a browser.
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    parts = [NOVEL_TITLE, TITLE_SEPARATOR]
    web = webdriver.Chrome()
    try:
        for i in range(CHAPTER_START, CHAPTER_END + 1):
            web.get(chapter_url(i))
            # The chapter body is <div class="subject_main" id="chapter-content">.
            # find_element(By...) replaces find_element_by_class_name, which
            # was removed in Selenium 4.3.
            content = web.find_element(By.CLASS_NAME, "subject_main").text
            parts.append(chapter_header(i))
            parts.append(content)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and runs even when a page fails to load.
        web.quit()

    # Join once instead of growing one string with += for every chapter.
    full_text = "".join(parts)
    print(full_text)
    return full_text


if __name__ == "__main__":
    main()
# ========================================
# Download a novel by chapter number (page id = chapter number + offset).
# ========================================
NOVEL_TITLE = "小说:嫡女媚"
HOME_URL = "https://www.bxwx123.com/novel/AEloS3/"
CHAPTER_START = 1
CHAPTER_END = 330
# Added to the chapter number to obtain the numeric page id in the URL.
START_PAGE_ID = 0
# NOTE(review): the separators are plain spaces in this copy of the script;
# they may originally have been newline escapes -- confirm before changing.
TITLE_SEPARATOR = " " + " " + " "


def chapter_url(chapter_no):
    """Return the page URL of a chapter (page id = chapter + START_PAGE_ID)."""
    return HOME_URL + str(chapter_no + START_PAGE_ID) + ".html"


def chapter_header(chapter_no):
    """Return the separator text written in front of a chapter body."""
    return TITLE_SEPARATOR + "======================" + " " + "第" + str(chapter_no) + "章" + " "


def main():
    """Fetch chapters CHAPTER_START..CHAPTER_END and print the whole text.

    :return: the title followed by each chapter header and chapter body
    """
    # Imported here so the pure helpers above stay importable on a machine
    # that has neither Selenium nor a browser.
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    parts = [NOVEL_TITLE, TITLE_SEPARATOR]
    web = webdriver.Chrome()
    try:
        for i in range(CHAPTER_START, CHAPTER_END + 1):
            web.get(chapter_url(i))
            # The chapter body is <div class="content">.
            # find_element(By...) replaces find_element_by_class_name, which
            # was removed in Selenium 4.3.
            content = web.find_element(By.CLASS_NAME, "content").text
            parts.append(chapter_header(i))
            parts.append(content)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and runs even when a page fails to load.
        web.quit()

    # Join once instead of growing one string with += for every chapter.
    full_text = "".join(parts)
    print(full_text)
    return full_text


if __name__ == "__main__":
    main()
# ========================================
# Download a novel by chapter number; the chapter body is located by id.
# ========================================
NOVEL_TITLE = "小说:穿越种田之将门妻"
HOME_URL = "https://www.jingcaiyuedu6.com/novel/CW8MY3/"
CHAPTER_START = 1
CHAPTER_END = 39
# Added to the chapter number to obtain the numeric page id in the URL.
START_PAGE_ID = 0
# NOTE(review): the separators are plain spaces in this copy of the script;
# they may originally have been newline escapes -- confirm before changing.
TITLE_SEPARATOR = " " + " " + " "


def chapter_url(chapter_no):
    """Return the page URL of a chapter (page id = chapter + START_PAGE_ID)."""
    return HOME_URL + str(chapter_no + START_PAGE_ID) + ".html"


def chapter_header(chapter_no):
    """Return the separator text written in front of a chapter body."""
    return TITLE_SEPARATOR + "======================" + " " + "第" + str(chapter_no) + "章" + " "


def main():
    """Fetch chapters CHAPTER_START..CHAPTER_END and print the whole text.

    :return: the title followed by each chapter header and chapter body
    """
    # Imported here so the pure helpers above stay importable on a machine
    # that has neither Selenium nor a browser.
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    parts = [NOVEL_TITLE, TITLE_SEPARATOR]
    web = webdriver.Chrome()
    try:
        for i in range(CHAPTER_START, CHAPTER_END + 1):
            web.get(chapter_url(i))
            # The chapter body is <div id="content">.
            # find_element(By...) replaces find_element_by_id, which was
            # removed in Selenium 4.3.
            content = web.find_element(By.ID, "content").text
            parts.append(chapter_header(i))
            parts.append(content)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and runs even when a page fails to load.
        web.quit()

    # Join once instead of growing one string with += for every chapter.
    full_text = "".join(parts)
    print(full_text)
    return full_text


if __name__ == "__main__":
    main()
================================
从列表页提取单章url, 然后下载单章文本
================================
# ========================================
# Method 1: digit-by-digit conversion.
# Flawed: every digit is mapped on its own, so 10 becomes "一零".
# ========================================
def num_to_char(num):
    """Map each decimal digit of ``num`` to its Chinese numeral character.

    :param num: value whose ``str()`` form consists only of the digits 0-9
    :return: the digits as Chinese characters, e.g. 10 -> "一零"
    :raises KeyError: if ``str(num)`` contains a non-digit character
    """
    num_dict = {"0": u"零", "1": u"一", "2": u"二", "3": u"三", "4": u"四",
                "5": u"五", "6": u"六", "7": u"七", "8": u"八", "9": u"九"}
    return "".join(num_dict[digit] for digit in str(num))


# ========================================
# Method 2: positional conversion with the units 十/百/千/万/亿.
# ========================================
# Name:   num2chinese
# Author: yunhgu
# Date:   2021/8/24 14:51
_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
_P0 = (u'', u'十', u'百', u'千',)
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
_MIN, _MAX = 0, 9999999999999999


class NotIntegerError(Exception):
    """Raised when ``str(value)`` is not made of decimal digits only."""


class OutOfRangeError(Exception):
    """Raised when the value lies outside ``[_MIN, _MAX]``."""


class Num2Chinese:
    """Convert integers in ``[_MIN, _MAX]`` to Chinese numerals."""

    def convert(self, number: int):
        """Return the Chinese numeral for ``number``.

        :param number: integer in ``[0, 9999999999999999]``
        :return: Chinese numeral string, e.g. 110 -> "一百一十"
        :raises NotIntegerError: if ``str(number)`` is not all digits (this
            includes negative numbers, because of the sign character)
        :raises OutOfRangeError: if ``number`` is greater than ``_MAX``
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch on its magnitude."""
        if not str(num).isdigit():
            raise NotIntegerError(u'%s is not a integer.' % num)
        if num < _MIN or num > _MAX:
            # _MAX itself is accepted, so the range is shown as closed.
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        if num < _S8:
            return self._to_chinese8(num)
        return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num, leading=True):
        """Convert ``0 <= num < 10000``.

        :param num: the value of one four-digit section
        :param leading: True when this section starts the whole number; only
            then is a leading "一十" shortened to "十" (10 -> "十"), while
            "一十" elsewhere is kept (110 -> "一百一十")
        """
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]
        digits = []  # least-significant digit first
        while num >= 10:
            digits.append(num % 10)
            num = num // 10
        digits.append(num)
        count = len(digits)
        result = u''
        # Built back to front (unit then digit) and reversed afterwards.
        for idx, val in enumerate(digits):
            if val != 0:
                result += _P0[idx] + _MAPPING[val]
                # A zero in the next higher position is read out as "零".
                if idx < count - 1 and digits[idx + 1] == 0:
                    result += u'零'
        result = result[::-1]
        # Shorten only at the very start; a global replace would turn
        # "一百一十" into "一百十".
        if leading and result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num, leading=True):
        """Convert ``num < 10**8`` as <high section>万<low section>."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num, leading)
        high, low = divmod(num, _S4)
        if low == 0:
            return to4(high, leading) + u'万'
        # A low section below 1000 has a missing 千 digit, read as "零".
        joiner = u'万零' if low < _S4 // 10 else u'万'
        return to4(high, leading) + joiner + to4(low, False)

    def _to_chinese16(self, num, leading=True):
        """Convert ``num < 10**16`` as <high part>亿<low part>."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = divmod(num, _S8)
        if low == 0:
            return to8(high, leading) + u'亿'
        joiner = u'亿零' if low < _S8 // 10 else u'亿'
        return to8(high, leading) + joiner + to8(low, False)


# ========================================
# Read each chapter URL from the list page, then download that chapter.
# ========================================
NOVEL_TITLE = "小说:嫡女媚"
LIST_URL = "https://qilinchess.com/html/204/204846/"
CHAPTER_START = 110
CHAPTER_END = 326
# NOTE(review): the separators are plain spaces in this copy of the script;
# they may originally have been newline escapes -- confirm before changing.
TITLE_SEPARATOR = " " + " " + " "


def chapter_link_text(chapter_no):
    """Return the link text searched for on the list page, e.g. "第一百一十章"."""
    # Num2Chinese now yields "一百一十" itself, so the former
    # replace("百十", "百一十") workaround is no longer needed.
    return "第" + Num2Chinese().convert(chapter_no) + "章"


def chapter_header(chapter_no):
    """Return the separator text written in front of a chapter body."""
    return TITLE_SEPARATOR + "======================" + " " + "第" + str(chapter_no) + "章" + " "


def main():
    """Download chapters CHAPTER_START..CHAPTER_END, printing each one.

    :return: the title followed by every chapter header and chapter body
    """
    # Imported here so the converter above can be imported and tested on a
    # machine that has neither Selenium nor a browser.
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    parts = [NOVEL_TITLE, TITLE_SEPARATOR]
    web = webdriver.Chrome()
    try:
        for i in range(CHAPTER_START, CHAPTER_END + 1):
            # Go back to the list page to look up this chapter's URL.
            web.get(LIST_URL)
            try:
                link = web.find_element(By.PARTIAL_LINK_TEXT, chapter_link_text(i))
                url = link.get_attribute("href")
            except NoSuchElementException:
                # The list page has no link for this chapter.
                url = ""

            if url:  # also skips a link that has no href (None)
                web.get(url)
                # The chapter body is <dd id="contents">, the third <dd>.
                content = web.find_elements(By.CSS_SELECTOR, "dd")[2].text
            else:
                content = "不提供下载"

            chapter_text = chapter_header(i) + content
            print(chapter_text)
            parts.append(chapter_text)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and runs even when a page fails to load.
        web.quit()
    return "".join(parts)


if __name__ == "__main__":
    main()
# ========================================
# Method 1: digit-by-digit conversion.
# Flawed: every digit is mapped on its own, so 10 becomes "一零".
# ========================================
def num_to_char(num):
    """Map each decimal digit of ``num`` to its Chinese numeral character.

    :param num: value whose ``str()`` form consists only of the digits 0-9
    :return: the digits as Chinese characters, e.g. 10 -> "一零"
    :raises KeyError: if ``str(num)`` contains a non-digit character
    """
    num_dict = {"0": u"零", "1": u"一", "2": u"二", "3": u"三", "4": u"四",
                "5": u"五", "6": u"六", "7": u"七", "8": u"八", "9": u"九"}
    return "".join(num_dict[digit] for digit in str(num))


# ========================================
# Method 2: positional conversion with the units 十/百/千/万/亿.
# ========================================
# Name:   num2chinese
# Author: yunhgu
# Date:   2021/8/24 14:51
_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
_P0 = (u'', u'十', u'百', u'千',)
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
_MIN, _MAX = 0, 9999999999999999


class NotIntegerError(Exception):
    """Raised when ``str(value)`` is not made of decimal digits only."""


class OutOfRangeError(Exception):
    """Raised when the value lies outside ``[_MIN, _MAX]``."""


class Num2Chinese:
    """Convert integers in ``[_MIN, _MAX]`` to Chinese numerals."""

    def convert(self, number: int):
        """Return the Chinese numeral for ``number``.

        :param number: integer in ``[0, 9999999999999999]``
        :return: Chinese numeral string, e.g. 110 -> "一百一十"
        :raises NotIntegerError: if ``str(number)`` is not all digits (this
            includes negative numbers, because of the sign character)
        :raises OutOfRangeError: if ``number`` is greater than ``_MAX``
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch on its magnitude."""
        if not str(num).isdigit():
            raise NotIntegerError(u'%s is not a integer.' % num)
        if num < _MIN or num > _MAX:
            # _MAX itself is accepted, so the range is shown as closed.
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        if num < _S8:
            return self._to_chinese8(num)
        return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num, leading=True):
        """Convert ``0 <= num < 10000``.

        :param num: the value of one four-digit section
        :param leading: True when this section starts the whole number; only
            then is a leading "一十" shortened to "十" (10 -> "十"), while
            "一十" elsewhere is kept (110 -> "一百一十")
        """
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]
        digits = []  # least-significant digit first
        while num >= 10:
            digits.append(num % 10)
            num = num // 10
        digits.append(num)
        count = len(digits)
        result = u''
        # Built back to front (unit then digit) and reversed afterwards.
        for idx, val in enumerate(digits):
            if val != 0:
                result += _P0[idx] + _MAPPING[val]
                # A zero in the next higher position is read out as "零".
                if idx < count - 1 and digits[idx + 1] == 0:
                    result += u'零'
        result = result[::-1]
        # Shorten only at the very start; a global replace would turn
        # "一百一十" into "一百十".
        if leading and result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num, leading=True):
        """Convert ``num < 10**8`` as <high section>万<low section>."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num, leading)
        high, low = divmod(num, _S4)
        if low == 0:
            return to4(high, leading) + u'万'
        # A low section below 1000 has a missing 千 digit, read as "零".
        joiner = u'万零' if low < _S4 // 10 else u'万'
        return to4(high, leading) + joiner + to4(low, False)

    def _to_chinese16(self, num, leading=True):
        """Convert ``num < 10**16`` as <high part>亿<low part>."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = divmod(num, _S8)
        if low == 0:
            return to8(high, leading) + u'亿'
        joiner = u'亿零' if low < _S8 // 10 else u'亿'
        return to8(high, leading) + joiner + to8(low, False)


# ========================================
# Read each chapter URL from the list page, then download that chapter.
# ========================================
NOVEL_TITLE = "小说:掌家小娘子"
LIST_URL = "https://www.baihexs.com/0/54/"
CHAPTER_START = 1
CHAPTER_END = 306
# True: list page titles use Chinese numerals ("第十章");
# False: they use Arabic numerals ("第10章").
USE_CHINESE_NUMERALS = True
# NOTE(review): the separators are plain spaces in this copy of the script;
# they may originally have been newline escapes -- confirm before changing.
TITLE_SEPARATOR = " " + " " + " "


def chapter_link_text(chapter_no, chinese_numerals=USE_CHINESE_NUMERALS):
    """Return the link text searched for on the list page.

    :param chapter_no: chapter number
    :param chinese_numerals: write the number in Chinese numerals if true,
        otherwise in Arabic digits
    """
    # Num2Chinese now yields "一百一十" itself, so the former
    # replace("百十", "百一十") workaround is no longer needed.
    if chinese_numerals:
        chapter_id = Num2Chinese().convert(chapter_no)
    else:
        chapter_id = str(chapter_no)
    return "第" + chapter_id + "章"


def chapter_header(chapter_no):
    """Return the separator text written in front of a chapter body."""
    return TITLE_SEPARATOR + "======================" + " " + "第" + str(chapter_no) + "章" + " "


def main():
    """Download chapters CHAPTER_START..CHAPTER_END, printing each one.

    :return: the title followed by every chapter header and chapter body
    """
    # Imported here so the converter above can be imported and tested on a
    # machine that has neither Selenium nor a browser.
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    parts = [NOVEL_TITLE, TITLE_SEPARATOR]
    print(NOVEL_TITLE + TITLE_SEPARATOR)
    web = webdriver.Chrome()
    try:
        for i in range(CHAPTER_START, CHAPTER_END + 1):
            # Go back to the list page to look up this chapter's URL.
            web.get(LIST_URL)
            try:
                link = web.find_element(By.PARTIAL_LINK_TEXT, chapter_link_text(i))
                url = link.get_attribute("href")
            except NoSuchElementException:
                # The list page has no link for this chapter.
                url = ""

            if url:  # also skips a link that has no href (None)
                web.get(url)
                # The chapter text is read from the element with id="center".
                content = web.find_element(By.XPATH, '''//*[@id="center"]''').text
            else:
                content = "不提供下载"

            chapter_text = chapter_header(i) + content
            print(chapter_text)
            parts.append(chapter_text)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and runs even when a page fails to load.
        web.quit()
    return "".join(parts)


if __name__ == "__main__":
    main()
================================
每章支持多个分页
================================
# ========================================
# Method 1: digit-by-digit conversion.
# Flawed: every digit is mapped on its own, so 10 becomes "一零".
# ========================================
def num_to_char(num):
    """Map each decimal digit of ``num`` to its Chinese numeral character.

    :param num: value whose ``str()`` form consists only of the digits 0-9
    :return: the digits as Chinese characters, e.g. 10 -> "一零"
    :raises KeyError: if ``str(num)`` contains a non-digit character
    """
    num_dict = {"0": u"零", "1": u"一", "2": u"二", "3": u"三", "4": u"四",
                "5": u"五", "6": u"六", "7": u"七", "8": u"八", "9": u"九"}
    return "".join(num_dict[digit] for digit in str(num))


# ========================================
# Method 2: positional conversion with the units 十/百/千/万/亿.
# ========================================
# Name:   num2chinese
# Author: yunhgu
# Date:   2021/8/24 14:51
_MAPPING = (u'零', u'一', u'二', u'三', u'四', u'五', u'六', u'七', u'八', u'九',)
_P0 = (u'', u'十', u'百', u'千',)
_S4, _S8, _S16 = 10 ** 4, 10 ** 8, 10 ** 16
_MIN, _MAX = 0, 9999999999999999


class NotIntegerError(Exception):
    """Raised when ``str(value)`` is not made of decimal digits only."""


class OutOfRangeError(Exception):
    """Raised when the value lies outside ``[_MIN, _MAX]``."""


class Num2Chinese:
    """Convert integers in ``[_MIN, _MAX]`` to Chinese numerals."""

    def convert(self, number: int):
        """Return the Chinese numeral for ``number``.

        :param number: integer in ``[0, 9999999999999999]``
        :return: Chinese numeral string, e.g. 110 -> "一百一十"
        :raises NotIntegerError: if ``str(number)`` is not all digits (this
            includes negative numbers, because of the sign character)
        :raises OutOfRangeError: if ``number`` is greater than ``_MAX``
        """
        return self._to_chinese(number)

    def _to_chinese(self, num):
        """Validate ``num`` and dispatch on its magnitude."""
        if not str(num).isdigit():
            raise NotIntegerError(u'%s is not a integer.' % num)
        if num < _MIN or num > _MAX:
            # _MAX itself is accepted, so the range is shown as closed.
            raise OutOfRangeError(u'%d out of range[%d, %d]' % (num, _MIN, _MAX))
        if num < _S4:
            return self._to_chinese4(num)
        if num < _S8:
            return self._to_chinese8(num)
        return self._to_chinese16(num)

    @staticmethod
    def _to_chinese4(num, leading=True):
        """Convert ``0 <= num < 10000``.

        :param num: the value of one four-digit section
        :param leading: True when this section starts the whole number; only
            then is a leading "一十" shortened to "十" (10 -> "十"), while
            "一十" elsewhere is kept (110 -> "一百一十")
        """
        assert (0 <= num < _S4)
        if num < 10:
            return _MAPPING[num]
        digits = []  # least-significant digit first
        while num >= 10:
            digits.append(num % 10)
            num = num // 10
        digits.append(num)
        count = len(digits)
        result = u''
        # Built back to front (unit then digit) and reversed afterwards.
        for idx, val in enumerate(digits):
            if val != 0:
                result += _P0[idx] + _MAPPING[val]
                # A zero in the next higher position is read out as "零".
                if idx < count - 1 and digits[idx + 1] == 0:
                    result += u'零'
        result = result[::-1]
        # Shorten only at the very start; a global replace would turn
        # "一百一十" into "一百十".
        if leading and result.startswith(u'一十'):
            result = result[1:]
        return result

    def _to_chinese8(self, num, leading=True):
        """Convert ``num < 10**8`` as <high section>万<low section>."""
        assert (num < _S8)
        to4 = self._to_chinese4
        if num < _S4:
            return to4(num, leading)
        high, low = divmod(num, _S4)
        if low == 0:
            return to4(high, leading) + u'万'
        # A low section below 1000 has a missing 千 digit, read as "零".
        joiner = u'万零' if low < _S4 // 10 else u'万'
        return to4(high, leading) + joiner + to4(low, False)

    def _to_chinese16(self, num, leading=True):
        """Convert ``num < 10**16`` as <high part>亿<low part>."""
        assert (num < _S16)
        to8 = self._to_chinese8
        high, low = divmod(num, _S8)
        if low == 0:
            return to8(high, leading) + u'亿'
        joiner = u'亿零' if low < _S8 // 10 else u'亿'
        return to8(high, leading) + joiner + to8(low, False)


def get_sub_page_url(chapter_url, sub_page_count, first_sub_index, sub_page_id):
    """Return the URL of one sub-page of a chapter.

    Example: https://www.mht99.com/98886/82000964.html has the sub-page
    https://www.mht99.com/98886/82000964_1.html.

    :param chapter_url: URL of the chapter
    :param sub_page_count: number of sub-pages per chapter; 0 means the
        chapter is not split and ``chapter_url`` is returned unchanged
    :param first_sub_index: lowest sub-page index that carries a ``_<index>``
        suffix; smaller indexes map to ``chapter_url`` itself
    :param sub_page_id: index of the requested sub-page
    :return: ``chapter_url``, or ``chapter_url`` with ``_<sub_page_id>``
        inserted in front of ``.html``
    """
    if sub_page_count == 0 or sub_page_id < first_sub_index:
        return chapter_url
    return chapter_url.replace(".html", "_" + str(sub_page_id) + ".html")


# ========================================
# Read each chapter URL from the list page, then download every sub-page
# of that chapter.
# ========================================
NOVEL_TITLE = "小说:秀才家的冲喜娘子"
LIST_URL = "https://www.mht99.com/98886/"
SUB_PAGE_COUNT = 4    # pages per chapter
FIRST_SUB_INDEX = 1   # index used in the URL of the first suffixed sub-page
CHAPTER_START = 1
CHAPTER_END = 480
# True: list page titles use Chinese numerals ("第十章");
# False: they use Arabic numerals ("第10章").
USE_CHINESE_NUMERALS = False
# NOTE(review): the separators are plain spaces in this copy of the script;
# they may originally have been newline escapes -- confirm before changing.
TITLE_SEPARATOR = " " + " " + " "
SUB_PAGE_SEPARATOR = " " + " "


def chapter_link_text(chapter_no, chinese_numerals=USE_CHINESE_NUMERALS):
    """Return the link text searched for on the list page.

    :param chapter_no: chapter number
    :param chinese_numerals: write the number in Chinese numerals if true,
        otherwise in Arabic digits
    """
    if chinese_numerals:
        chapter_id = Num2Chinese().convert(chapter_no)
    else:
        chapter_id = str(chapter_no)
    return "第" + chapter_id + "章"


def chapter_header(chapter_no):
    """Return the separator text written in front of a chapter body."""
    return TITLE_SEPARATOR + "======================" + " " + "第" + str(chapter_no) + "章" + " "


def main():
    """Download chapters CHAPTER_START..CHAPTER_END, printing each one.

    :return: the title followed by every chapter header and chapter body
    """
    # Imported here so the helpers above can be imported and tested on a
    # machine that has neither Selenium nor a browser.
    from selenium import webdriver
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.by import By

    parts = [NOVEL_TITLE, TITLE_SEPARATOR]
    print(NOVEL_TITLE + TITLE_SEPARATOR)
    web = webdriver.Chrome()
    try:
        for i in range(CHAPTER_START, CHAPTER_END + 1):
            # Go back to the list page to look up this chapter's URL.
            web.get(LIST_URL)
            try:
                link = web.find_element(By.PARTIAL_LINK_TEXT, chapter_link_text(i))
                chapter_url = link.get_attribute("href")
            except NoSuchElementException:
                # The list page has no link for this chapter.
                chapter_url = ""

            if chapter_url:  # also skips a link that has no href (None)
                page_texts = []
                # NOTE(review): ids run 1..SUB_PAGE_COUNT, so with
                # FIRST_SUB_INDEX = 1 the unsuffixed chapter URL is never
                # requested -- confirm this matches the site's URL scheme.
                for sub_page_id in range(1, SUB_PAGE_COUNT + 1):
                    sub_page_url = get_sub_page_url(
                        chapter_url, SUB_PAGE_COUNT, FIRST_SUB_INDEX, sub_page_id)
                    web.get(sub_page_url)
                    try:
                        # The page text is the element with id="content".
                        content_tag = web.find_element(By.XPATH, '''//*[@id="content"]''')
                    except NoSuchElementException:
                        print("####第" + str(sub_page_id) + "页:" + "下载失败")
                    else:
                        page_texts.append(SUB_PAGE_SEPARATOR + content_tag.text)
                chapter_content = "".join(page_texts)
            else:
                # Assigned to chapter_content (the original set an unused
                # page_content here), so a missing chapter no longer raises
                # NameError or repeats the previous chapter's text.
                chapter_content = "不提供下载"

            chapter_full_text = chapter_header(i) + chapter_content
            print(chapter_full_text)
            parts.append(chapter_full_text)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and runs even when a page fails to load.
        web.quit()
    return "".join(parts)


if __name__ == "__main__":
    main()
================================
selenium 的更多信息
================================
selenium 不仅支持Python, 还支持Java/C#
https://www.selenium.dev/documentation/zh-cn/webdriver/driver_requirements/
https://www.selenium.dev/documentation/zh-cn/selenium_installation/installing_webdriver_binaries/