主程序
"""Entry script: download every novel linked from the site's index page."""
from urllib import request
import gzip

from lxml import etree

import download_novelist

# Index page whose novel list links to every book to download.
INDEX_URL = "http://www.xbiquge.la/xiaoshuodaquan/"

# Every gzip stream starts with these two bytes.
_GZIP_MAGIC = b"\x1f\x8b"


def _fetch_html(url):
    """Return the body of *url* as text, gunzipping it first when needed.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded with the default (UTF-8) codec.
    """
    # The context manager closes the connection even if read() fails.
    with request.urlopen(url) as response:
        raw = response.read()
    # The body may or may not be gzip-compressed; detect it explicitly
    # instead of hiding every possible error behind a bare ``except``.
    if raw.startswith(_GZIP_MAGIC):
        raw = gzip.decompress(raw)
    return raw.decode()


def main():
    """Fetch the index page and download each listed novel in turn."""
    index = etree.HTML(_fetch_html(INDEX_URL))
    # href of every novel link in the index's novel list.
    book_urls = index.xpath("//div[@class='novellist']//ul/li/a/@href")
    for book_url in book_urls:
        download_novelist.download_ficition(book_url)
    # To fetch a single book instead, call for example:
    # download_novelist.download_ficition("http://www.xbiquge.la/7/7931/")


if __name__ == "__main__":
    main()
download_novelist.py
# coding=utf-8
"""Download a single novel, chapter by chapter, into a local text file."""
from urllib import request
from urllib.parse import urljoin
import gzip
import time

from lxml import etree

# Pause before each chapter request: the server fails to respond when it is
# hit too quickly.
_REQUEST_DELAY_SECONDS = 2

# Every gzip stream starts with these two bytes.
_GZIP_MAGIC = b"\x1f\x8b"


def _fetch_html(url):
    """Return the body of *url* as text, gunzipping it first when needed.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded with the default (UTF-8) codec.
    """
    # The context manager closes the connection even if read() fails.
    with request.urlopen(url) as response:
        raw = response.read()
    # The body may or may not be gzip-compressed; detect it explicitly
    # instead of hiding every possible error behind a bare ``except``.
    if raw.startswith(_GZIP_MAGIC):
        raw = gzip.decompress(raw)
    return raw.decode()


def download_ficition(url):
    """Download every chapter of the novel at *url* and append it to a file.

    The novel title is read from the page at *url*. Each chapter linked from
    that page is fetched in order and appended to ``<title>.txt`` in the
    current working directory. A progress line is printed per chapter.

    Args:
        url: Address of the novel's page, which carries the title and the
            list of chapter links.

    Raises:
        IndexError: If the novel page has no title heading, or a chapter
            page has no chapter heading.

    Errors raised while fetching pages or opening the output file are not
    caught here; errors raised while writing a chapter are reported with
    ``print`` and the download continues with the next chapter.
    """
    toc = etree.HTML(_fetch_html(url))
    # Title of the novel, taken from the info box heading.
    book_name = toc.xpath("//div[@id='info']//h1/text()")[0]
    # href of every chapter link in the chapter list.
    chapter_hrefs = toc.xpath("//div[@id='list']//dl/dd/a/@href")
    total = len(chapter_hrefs)

    # ``number`` counts chapters from 1 and is only used for progress output.
    for number, chapter_href in enumerate(chapter_hrefs, start=1):
        time.sleep(_REQUEST_DELAY_SECONDS)

        # Resolve the link against the page it came from; this handles
        # root-relative, relative and absolute hrefs without producing "//".
        chapter_url = urljoin(url, chapter_href)
        page = etree.HTML(_fetch_html(chapter_url))

        chapter_name = page.xpath("//div[@class='bookname']/h1/text()")[0]
        paragraphs = page.xpath("//div[@id='content']/text()")
        # Drop the "全部章节 " marker from the heading before saving it.
        heading = chapter_name.replace("全部章节 ", "")

        with open("%s.txt" % (book_name), "a", encoding="utf-8") as out:
            try:
                # Fraction of chapters already saved before this one; the
                # parentheses matter, ``number-1/total`` is not a fraction.
                print("正在保存%s的%s;现存储了%d次;已经完成%f!" % (
                    book_name, chapter_name, number, (number - 1) / total))
                # NOTE(review): heading and paragraphs are separated by
                # single spaces, so the output has no line breaks -- confirm
                # this is intended.
                out.write(heading + ' ')
                # Strip all whitespace inside each text node before writing.
                out.writelines("".join(p.split()) + ' ' for p in paragraphs)
            except Exception as exc:
                # Best effort: report the failure and move on to the next
                # chapter rather than aborting the whole download.
                print("%s小说保存失败!" % (book_name))
                print(exc)