zoukankan      html  css  js  c++  java
  • 爬取并下载「旧时光文学」小说

    要爬取小说的全部章节,所以需要从小说目录页开始爬取。由于只涉及文字内容,所以用 XPath 解析。

    # -*- coding: utf-8 -*-
    # @Time    : 2020/6/21 11:09
    # @Author  : banshaohuan
    # @Site    :
    # @File    : pa_xiaoshuo.py
    # @Software: PyCharm
    import requests
    from lxml import etree
    from fake_useragent import UserAgent
    
    # Request headers sent with every page fetch.
    ua = UserAgent()
    headers = {
        # "br" is deliberately not advertised: requests can only decode a
        # Brotli-compressed body when an optional brotli package is installed,
        # and an undecoded body would parse as garbage.
        "Accept-Encoding": "gzip, deflate",
        # Language tags are hyphenated ("zh-CN"), not underscored.
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "close",
        "User-Agent": ua.random,
    }
    
    
    def get_xml(url):
        """Fetch *url* and return the page parsed as an lxml HTML tree.
    
        The module-level ``headers`` are sent with the request, which times out
        after 10 seconds. The response is decoded with the encoding that
        requests detects from the body (``apparent_encoding``) before parsing.
    
        :param url: address of the page to download.
        :return: the value of ``etree.HTML`` applied to the decoded page text.
        """
        # headers must be passed by keyword: the second positional parameter of
        # requests.get is ``params``, which would put the dict in the query
        # string instead of the request headers.
        res = requests.get(url, headers=headers, timeout=10)
        res.encoding = res.apparent_encoding
        return etree.HTML(res.text)
    
    
    # 获取文章内容
    def get_content(xml, f):
        """Write the chapter text found in *xml* to the open text file *f*.
    
        The text nodes directly under ``<div class="readcontent">`` are written
        in document order. A node that is exactly one newline is skipped, and
        newlines are removed from every other node before it is written.
    
        :param xml: parsed page exposing an ``xpath`` method.
        :param f: writable text file object.
        """
        for text in xml.xpath('//div[@class="readcontent"]/text()'):
            # Skip the bare line breaks that sit between paragraphs.
            if text == "\n":
                continue
            f.write(text.replace("\n", ""))
    
    
    def download_book(url, skip_last=200):
        """Download a novel from its table-of-contents page into a text file.
    
        The book title is read from the page's ``<h1>``, and the chapter titles
        and links from the anchors under ``<div id="list-chapterAll">``. Each
        chapter is fetched in turn and appended to ``D:/<title>.txt``. When a
        chapter page is marked "(1/2)", its second page (``..._2.html``) is
        fetched and appended as well.
    
        :param url: address of the table-of-contents page; chapter hrefs are
            appended to it directly, so it is expected to end with "/".
        :param skip_last: number of trailing entries in the chapter list that
            are NOT downloaded. Defaults to 200, the value previously
            hard-coded in the loop bound.
            NOTE(review): with the default, a book of 200 chapters or fewer
            yields an empty file; pass ``skip_last=0`` to fetch every chapter.
        :raises IndexError: if the page has no ``<h1>`` text.
        """
        xml_list = get_xml(url)
        name = xml_list.xpath("//h1/text()")
        # Chapter titles and their hrefs, taken from the same anchors.
        chapters = xml_list.xpath('//div[@id="list-chapterAll"]//dd/a/text()')
        links = xml_list.xpath('//div[@id="list-chapterAll"]//dd/a/@href')
    
        print(f"《{name[0]}》获取中,共{len(links)}章")
    
        file_name = f"D:/{name[0]}.txt"
        # Number of leading chapters to fetch; never negative.
        count = max(len(links) - skip_last, 0)
    
        with open(file_name, "w", encoding="utf-8") as f:
            for title, href in zip(chapters[:count], links[:count]):
                f.write("\n")
                f.write(title)
                url_text = f"{url}{href}"
                xml_content = get_xml(url_text)
                page = xml_content.xpath('//div[@class="book read"]//small/text()')
                get_content(xml_content, f)
                # List membership: matches only a <small> text node that is
                # exactly "(1/2)".
                if "(1/2)" in page:
                    # Drop the trailing ".html" (5 characters) to build the
                    # address of the second page.
                    url_text2 = f"{url_text[0:-5]}_2.html"
                    xml_content2 = get_xml(url_text2)
                    get_content(xml_content2, f)
                print(f"{title}:已完成")
        print("下载完成")
    
    
    if __name__ == "__main__":
        # Script entry point: the argument is the novel's table-of-contents page.
        download_book("https://www.oldtimescc.cc/go/16078/")
    
    

    参考:https://www.52pojie.cn/thread-1200971-1-1.html

  • 相关阅读:
    nginx-syslog
    loki
    idea安装中文插件
    nginx虚拟目录alias
    个人 软件系统整理
    Python 遍历Sheet 每个Sheet都单独保存为一个Excel
    SQL Server 多表关联的update语句
    电商 生意参谋 抓取 访客数据 JS版/谷歌插件版
    EF 多表关联
    个人 圈外同学 对比分析
  • 原文地址:https://www.cnblogs.com/banshaohuan/p/13172448.html
Copyright © 2011-2022 走看看