zoukankan html css js c++ java

python 下载小说

最近迷上了《三体》这部小说。网上的小说基本上都是分章节发布的，一篇一篇地人肉 Ctrl+C / Ctrl+V 实在是太 low 了，干脆自己写个脚本吧，一劳永逸。

基本思路：

1、获取小说首页所有的章节名称和链接
2、使用异步请求并发抓取所有章节的网页
3、根据网页内容使用xpath提取章节文本，再分章节存储

实现如下：

"""
===================================
    -*- coding:utf-8 -*-
    Author     :GadyPu
    E_mail     :Gadypy@gmail.com
    Time       :2020/10/7 0007 上午 11:59
    FileName   :spider.py
===================================
"""
import os
import re
import sys
import requests
from lxml import etree
import asyncio
import aiohttp
from queue import Queue
import threading

class GetNovels(object):
    """Download every chapter of a novel and store each one as a text file.

    The work is split into three stages:

    1. ``get_chapter_urls`` scrapes the index page for chapter titles/links.
    2. ``fetch`` coroutines download the chapter pages concurrently.
    3. ``parse_chapter`` worker threads extract the text and write the files.
    """

    # Characters replaced with '-' when building a chapter file name.
    _ILLEGAL_FILENAME_CHARS = re.compile(r'[/:*?"<>|]')
    # Number of concurrent ``fetch`` coroutines; caps how fast the site is hit.
    _FETCH_WORKERS = 20
    # Number of threads running ``parse_chapter``.
    _PARSE_WORKERS = 3

    def __init__(self, url, name):
        """Store the crawl settings and create the two hand-off queues.

        Args:
            url: Address of the novel's index page (the chapter list).
            name: Novel name; ``run`` uses it as the output directory name.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36'
        }
        self.novel_url = url
        # (title, link) pairs waiting to be downloaded by ``fetch``.
        self.htmlQ = asyncio.Queue()
        # (title, page_html) pairs waiting to be parsed; a ``None`` entry
        # tells one ``parse_chapter`` worker to stop.
        self.chapterQ = Queue()
        self.novel_name = name

    def get_chapter_urls(self):
        """Scrape the index page and queue every (title, link) pair.

        On any failure an error message is printed and the process exits
        via ``sys.exit()``.
        """
        try:
            # A timeout keeps an unresponsive server from hanging the script.
            response = requests.get(
                url=self.novel_url, headers=self.headers, timeout=30
            )
            html = etree.HTML(response.content.decode('utf-8'))
            titles = html.xpath(r'//div[@class="book-list clearfix"]/ul/li/a/text()')
            links = html.xpath(r'//div[@class="book-list clearfix"]/ul/li/a/@href')
            for title, link in zip(titles, links):
                self.htmlQ.put_nowait((title, link))
        except Exception as e:
            print(e, '\n', 'network error cannot parse chapter url')
            sys.exit()

    async def fetch(self):
        """Download queued chapter pages until ``htmlQ`` is empty.

        Every page answered with HTTP 200 is decoded as UTF-8 and placed on
        ``chapterQ`` together with its title; other responses are dropped.
        On any failure an error message is printed and the process exits
        via ``sys.exit()``.
        """
        try:
            # NOTE(review): ssl=False turns off certificate verification in
            # aiohttp - confirm this is intended for the target site.
            async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
                while not self.htmlQ.empty():
                    title, link = await self.htmlQ.get()
                    async with session.get(url=link, headers=self.headers) as response:
                        if response.status == 200:
                            html = await response.read()
                            self.chapterQ.put((title, html.decode('utf-8')))
                            # Short pause after each page to throttle the crawl.
                            await asyncio.sleep(0.3)
        except Exception as e:
            print(e, '\n', 'network error cannot fetch chapters...')
            sys.exit()

    def parse_chapter(self, path, id):
        """Worker loop: turn downloaded pages into text files.

        Blocks on ``chapterQ`` and returns as soon as a falsy item (the
        ``None`` sentinel queued by ``run``) is received.

        Args:
            path: Root output directory for the novel.
            id: Worker number, used only in the progress message.
        """
        while True:
            data = self.chapterQ.get()
            if not data:
                break
            title, page = data
            html = etree.HTML(page)
            # Collect every text node under the <p> tags of the content node.
            content = html.xpath(r'//*[@id="nr1"]/p//text()')
            # Fifth breadcrumb entry names the sub-directory
            # (presumably the volume title - confirm against the site).
            chapter = html.xpath(r'//*[@id="bcrumb"]/span[5]/a/text()')[0]
            chapter_dir = os.path.join(path, chapter)

            # exist_ok avoids a FileExistsError when two workers create the
            # same directory at the same moment.
            os.makedirs(chapter_dir, exist_ok=True)
            chapter_name = os.path.join(
                chapter_dir, self._ILLEGAL_FILENAME_CHARS.sub('-', title)
            )

            print(f'thread:{id} is parsing: ' + title)
            with open(chapter_name + '.txt', 'w', encoding='utf-8') as wf:
                wf.write(title + '\n\n')
                wf.writelines(str(cont) + '\n' for cont in content)

    def run(self):
        """Run the whole pipeline: index -> download -> parse and save."""
        self.get_chapter_urls()

        path = os.path.join(os.getcwd(), self.novel_name)
        os.makedirs(path, exist_ok=True)

        # Start the parser threads first so pages are written while the
        # downloads are still in progress.
        thread_lists = []
        for i in range(self._PARSE_WORKERS):
            t = threading.Thread(
                target=self.parse_chapter, args=(path, i + 1), daemon=True
            )
            thread_lists.append(t)
            t.start()

        async def crawl():
            # Cap concurrency so the site is not crawled too fast.
            await asyncio.gather(
                *(self.fetch() for _ in range(self._FETCH_WORKERS))
            )

        asyncio.run(crawl())

        # One sentinel per worker, then wait for all files to be written.
        for _ in thread_lists:
            self.chapterQ.put_nowait(None)
        for t in thread_lists:
            t.join()

if __name__ == '__main__':
    # Script entry point: crawl the novel at the luoxia.com index page below
    # and save it under a directory named after the novel.
    GetNovels(url='https://www.luoxia.com/santi/', name='三体').run()

查看全文

相关阅读:
Angularjs中的缓存以及缓存清理
 举例子来说明Python引用和对象
 对象关系映射ORM
Apache Storm 核心概念
 Linux如何查看哪个进程占用的SWAP分区比较多？
MySQL彻底清除slave信息
 监控MySQL的时候监控用户应该怎么授权？
MySQL用户密码修改
 专职DBA-Zabbix 3.0 for percona-server TokuDB
防止rm强制删除

原文地址：https://www.cnblogs.com/GadyPu/p/13777270.html