zoukankan      html  css  js  c++  java
  • python 下载小说

    最近迷上了《三体》这部小说,但网上的小说基本上都是分章节发布的,一篇一篇地人肉 Ctrl+C、Ctrl+V 实在是太 low 了。干脆自己写个脚本吧,一劳永逸。

    基本思路:

    • 1、获取小说首页所有的章节名称和链接
    • 2、使用异步请求所有的章节网页
    • 3、根据网页内容使用xpath提取章节文本,再分章节存储

    实现如下:

    """
    ===================================
        -*- coding:utf-8 -*-
        Author     :GadyPu
        E_mail     :Gadypy@gmail.com
        Time       :2020/10/7 0007 上午 11:59
        FileName   :spider.py
    ===================================
    """
    import os
    import re
    import sys
    import requests
    from lxml import etree
    import asyncio
    import aiohttp
    from queue import Queue
    import threading
    
    class GetNovels(object):
        """Download every chapter of a novel and save each one as a text file.

        Workflow (see ``run``):
          1. ``get_chapter_urls`` scrapes the index page for (title, link) pairs.
          2. ``fetch`` coroutines download the chapter pages concurrently.
          3. ``parse_chapter`` worker threads extract the text and write files.
        """

        # Number of concurrent download coroutines; kept bounded so the site is
        # not crawled too aggressively.
        FETCH_WORKERS = 20
        # Number of threads that parse downloaded pages and write the files.
        PARSE_WORKERS = 3
        # Characters replaced with '-' before a string is used as a file or
        # directory name (path separators and characters Windows rejects).
        _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|]')

        def __init__(self, url, name):
            """Store the index page ``url`` and the output directory ``name``."""
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
            }
            self.novel_url = url
            # (title, link) pairs waiting to be downloaded by ``fetch``.
            self.htmlQ = asyncio.Queue()
            # (title, html) pairs waiting to be parsed; thread-safe because it
            # is filled from the event loop and drained by worker threads.
            self.chapterQ = Queue()
            self.novel_name = name

        @classmethod
        def _safe_name(cls, text):
            """Return ``text`` with characters unsafe in file names replaced by '-'."""
            return cls._UNSAFE_CHARS.sub('-', text)

        def get_chapter_urls(self):
            """Scrape the index page and queue one (title, link) pair per chapter.

            Prints the error and exits the process with status 1 when the page
            cannot be downloaded or parsed.
            """
            try:
                response = requests.get(url=self.novel_url, headers=self.headers, timeout=30)
                # Fail early on 4xx/5xx instead of parsing an error page.
                response.raise_for_status()
                html = etree.HTML(response.content.decode('utf-8'))
                titles = html.xpath(r'//div[@class="book-list clearfix"]/ul/li/a/text()')
                links = html.xpath(r'//div[@class="book-list clearfix"]/ul/li/a/@href')
                for title, link in zip(titles, links):
                    self.htmlQ.put_nowait((title, link))
            except Exception as e:
                print(e, '\n', 'network error cannot parse chapter url')
                sys.exit(1)

        async def fetch(self):
            """Download queued chapter pages until the URL queue is empty.

            Each successfully downloaded page is put on ``chapterQ`` as a
            (title, html) pair. Responses with a status other than 200 are
            reported and skipped. Any other failure prints the error and exits
            the process with status 1.
            """
            try:
                # NOTE: ssl=False turns off TLS certificate verification.
                connector = aiohttp.TCPConnector(ssl=False)
                async with aiohttp.ClientSession(connector=connector) as session:
                    while True:
                        # get_nowait never waits, so no coroutine can block
                        # forever on a queue that a sibling has just drained.
                        try:
                            title, link = self.htmlQ.get_nowait()
                        except asyncio.QueueEmpty:
                            break
                        async with session.get(url=link, headers=self.headers) as response:
                            if response.status != 200:
                                print(f'skip {title}: HTTP status {response.status}')
                                continue
                            html = await response.read()
                        self.chapterQ.put((title, html.decode('utf-8')))
                        # Short pause between requests to limit the crawl rate.
                        await asyncio.sleep(0.3)
            except Exception as e:
                print(e, '\n', 'network error cannot fetch chapters...')
                sys.exit(1)

        def parse_chapter(self, path, id):
            """Worker loop: turn downloaded pages into text files under ``path``.

            Blocks on ``chapterQ`` and stops when it receives a falsy item (the
            ``None`` sentinel that ``run`` enqueues). Every chapter is written to
            ``<path>/<breadcrumb text>/<title>.txt``.

            Args:
                path: Root output directory.
                id: Worker number, used only in progress messages.
            """
            while True:
                data = self.chapterQ.get()
                if not data:
                    break
                title, page = data
                html = etree.HTML(page)
                # All text nodes below the <p> tags of the content container.
                content = html.xpath(r'//*[@id="nr1"]/p//text()')
                # Breadcrumb entry used as the sub-directory name.
                crumbs = html.xpath(r'//*[@id="bcrumb"]/span[5]/a/text()')
                if not crumbs:
                    # Without this guard an IndexError would silently kill the
                    # worker thread and lose every chapter it would have handled.
                    print(f'thread:{id} cannot find breadcrumb, skip: ' + title)
                    continue
                chapter_dir = os.path.join(path, self._safe_name(crumbs[0]))
                # exist_ok avoids the check-then-create race between workers.
                os.makedirs(chapter_dir, exist_ok=True)
                chapter_name = os.path.join(chapter_dir, self._safe_name(title))

                print(f'thread:{id} is parsing: ' + title)
                with open(chapter_name + '.txt', 'w', encoding='utf-8') as wf:
                    wf.write(title + '\n\n')
                    wf.writelines(str(cont) + '\n' for cont in content)

        async def _fetch_all(self):
            """Run ``FETCH_WORKERS`` ``fetch`` coroutines and wait for all of them."""
            await asyncio.gather(*(self.fetch() for _ in range(self.FETCH_WORKERS)))

        def run(self):
            """Collect chapter links, download the pages and write them to disk.

            Output goes to ``<current working directory>/<novel_name>``.
            """
            self.get_chapter_urls()

            path = os.path.join(os.getcwd(), self.novel_name)
            os.makedirs(path, exist_ok=True)

            # Start the parser threads first so pages are written while the
            # downloads are still in progress.
            workers = []
            for i in range(self.PARSE_WORKERS):
                t = threading.Thread(target=self.parse_chapter, args=(path, i + 1), daemon=True)
                t.start()
                workers.append(t)

            asyncio.run(self._fetch_all())

            # One sentinel per worker tells it that no more pages will arrive.
            for _ in workers:
                self.chapterQ.put(None)
            for t in workers:
                t.join()
    
    if __name__ == '__main__':
        # Script entry point: build the downloader for the given index page
        # and novel name, then start it.
        spider = GetNovels(name='三体', url='https://www.luoxia.com/santi/')
        spider.run()
    

      

  • 相关阅读:
    JavaScript基础知识-forEach循环
    JavaScript基础知识-数组的练习
    JavaScript基础知识-数组的遍历
    JavaScript基础知识-数组的常用方法
    JavaScript基础知识-数组基于索引访问
    JavaScript基础知识-数组的定义方式
    JavaScript基础知识-垃圾回收
    JavaScript基础知识-toString()
    JavaScript基础知识-原型(prototype)
    JavaScript基础知识-构造函数(也称为"类")定义
  • 原文地址:https://www.cnblogs.com/GadyPu/p/13777270.html
Copyright © 2011-2022 走看看