zoukankan      html  css  js  c++  java
  • 爬取笔趣阁网站小说

    主程序

    from urllib import request
    import gzip
    from lxml import etree
    import download_novelist
    
    # Index page that lists every novel hosted on the site.
    url = "http://www.xbiquge.la/xiaoshuodaquan/"

    # Read the whole index page; the context manager closes the HTTP response
    # as soon as the body has been read instead of leaving it to the GC.
    with request.urlopen(url) as response:
        res = response.read()

    # The body may or may not be gzip-compressed.  A gzip stream always starts
    # with the magic bytes 1f 8b (a sequence that is never valid UTF-8), so
    # test for it explicitly rather than hiding every possible error behind a
    # bare ``except``.
    if res[:2] == b"\x1f\x8b":
        data = gzip.decompress(res).decode()
    else:
        data = res.decode()

    ele = etree.HTML(data)

    # Titles of all listed novels (collected for reference; not used below).
    book_names = ele.xpath("//div[@class='novellist']//ul/li/a/text()")
    # URL of each novel's table-of-contents page.
    book_urls = ele.xpath("//div[@class='novellist']//ul/li/a/@href")

    # Download the novels one after another.
    for book_url in book_urls:
        download_novelist.download_ficition(book_url)

    # To download a single novel instead, call the downloader directly, e.g.:
    # download_novelist.download_ficition("http://www.xbiquge.la/7/7931/")
    download_novelist.py

    # coding=utf-8
    from urllib import request
    import gzip
    from lxml import etree
    import time
    
    def _fetch_html(page_url):
        """Fetch *page_url* and return its body parsed as an lxml HTML tree."""
        # Close the HTTP response deterministically once the body is read.
        with request.urlopen(page_url) as response:
            raw = response.read()
        # The body may or may not be gzip-compressed.  A gzip stream always
        # starts with the magic bytes 1f 8b (never valid UTF-8), so check for
        # it explicitly instead of relying on a bare ``except``.
        if raw[:2] == b"\x1f\x8b":
            raw = gzip.decompress(raw)
        return etree.HTML(raw.decode())


    def download_ficition(url):
        """Download every chapter of one novel into ``<book name>.txt``.

        The table-of-contents page at *url* supplies the book title and the
        list of chapter links.  Each chapter is fetched in turn and appended
        to a UTF-8 text file named after the book in the current directory:
        first the chapter title, then one line per text node of the chapter
        body with all whitespace removed.

        Args:
            url: URL of the novel's table-of-contents page.

        Raises:
            IndexError: if the book title or a chapter title cannot be found
                in the downloaded page.
            urllib.error.URLError: if a page cannot be fetched.

        Failures while writing a chapter are reported on stdout and do not
        stop the remaining chapters from being downloaded.
        """
        # Imported locally so the module's import block stays untouched.
        from urllib.parse import urljoin

        book_page = _fetch_html(url)

        # Book title shown on the table-of-contents page.
        titles = book_page.xpath("//div[@id='info']//h1/text()")
        if not titles:
            raise IndexError("no book title found at %s" % url)
        book_name = titles[0]
        # Links to the individual chapters.
        chapter_urls = book_page.xpath("//div[@id='list']//dl/dd/a/@href")
        total = len(chapter_urls)

        # ``number`` counts chapters from 1 and drives the progress message.
        for number, chapter_url in enumerate(chapter_urls, start=1):
            # Pause 2 seconds between requests; going faster overwhelms the
            # server and it stops responding.
            time.sleep(2)

            # Resolve the link against the site root; urljoin copes with
            # hrefs that do or do not start with "/".
            chapter_page = _fetch_html(urljoin("http://www.xbiquge.la/", chapter_url))

            names = chapter_page.xpath("//div[@class='bookname']/h1/text()")
            if not names:
                raise IndexError("no chapter title found at %s" % chapter_url)
            chapter_name = names[0]
            chapter_content = chapter_page.xpath("//div[@id='content']/text()")
            # Drop the "全部章节 " prefix from the chapter title.
            end_chapter_name = chapter_name.replace("全部章节 ", "")

            try:
                with open("%s.txt" % (book_name), "a", encoding="utf-8") as file:
                    # (number - 1) / total is the fraction completed before
                    # this chapter is saved.
                    print("正在保存%s的%s;现存储了%d次;已经完成%f!" % (
                        book_name, chapter_name, number, (number - 1) / total))
                    file.write(end_chapter_name + "\n")
                    for s in chapter_content:
                        # Collapse the text node by removing all whitespace.
                        file.write("".join(s.split()) + "\n")
            except (OSError, UnicodeError) as e:
                # Best effort: report the failure (with its cause) and carry
                # on with the next chapter.
                print("%s小说保存失败!" % (book_name), e)
     
    非学无以广才,非志无以成学。 正是因为今天的不完美,才对未来充满希望。 ----长帆
  • 相关阅读:
    laravel5.* 生成key
    Laravel 调试利器 Laravel Debugbar 扩展包安装及使用教程
    JS相关
    Git常用命令(全)
    linux添加计划任务
    30个php操作redis常用方法代码例子(转载)
    获取服务器IP和客户端IP
    PHP-redis中文文档(相关)
    常用算法排序
    软件下载(汇总)
  • 原文地址:https://www.cnblogs.com/changfan/p/12184991.html
Copyright © 2011-2022 走看看