  • Python crawler: save a novel chapter by chapter

    # coding: utf-8
    import os
    import requests
    from bs4 import BeautifulSoup


    class Downloader:

        def __init__(self, url):
            self.url = url      # index page of the novel
            self.urls = []      # chapter links
            self.names = []     # chapter titles

        def get_chapters(self):
            """Fetch the index page and collect every chapter title and link."""
            response = requests.get(self.url)
            response.encoding = 'gbk'  # the site is GBK-encoded; this avoids garbled text
            self.soup = BeautifulSoup(response.text, 'lxml')  # parse the page
            listmain = self.soup.find('div', class_='listmain')  # the chapter list lives in class='listmain'
            for a in listmain.find_all('a'):  # every <a> under the list is one chapter
                self.names.append(a.string)  # the tag text is the chapter title
                self.urls.append('https://www.biqugex.com%s' % a.get('href'))  # build the absolute chapter link

        def save_chapters(self):
            """Find the novel's title, create a folder with the same name,
            then save every chapter as its own .txt file inside it."""
            h2 = self.soup.select_one('body > div.book > div.info > h2')
            book_name = h2.string
            folder = r'C:\Users\Administrator\Desktop\%s' % book_name  # raw string, otherwise \U breaks the literal
            if not os.path.exists(folder):
                os.mkdir(folder)

            # Walk the chapter links and extract the body text of each chapter
            for name, chapter_url in zip(self.names, self.urls):
                response = requests.get(chapter_url)
                response.encoding = 'gbk'
                soup = BeautifulSoup(response.text, 'lxml')
                content = soup.find('div', id='content')  # chapter body

                filename = os.path.join(folder, name + '.txt')
                print(filename)

                # Write the chapter text to its own file
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(content.text)

        def run(self):
            """If the given URL is not a valid biqugex index page, ask for a correct one."""
            try:
                self.get_chapters()
                self.save_chapters()
            except Exception:
                print('Please enter a valid biqugex novel URL, e.g. https://www.biqugex.com/book_104027/')


    if __name__ == '__main__':
        url = input('Enter the novel index URL: ')
        Downloader(url).run()
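
    One pitfall the script above does not handle: chapter titles scraped from the index page can contain characters that Windows forbids in filenames (such as ? * : " < > |), in which case open() fails for those chapters. Below is a minimal sketch of a sanitizing helper; the name safe_filename and the underscore replacement are my own choice, not part of the original post.

        import re

        def safe_filename(title):
            """Replace characters that Windows forbids in filenames with '_'."""
            return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

        # Possible use inside save_chapters():
        # filename = os.path.join(folder, safe_filename(name) + '.txt')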
  • Original post: https://www.cnblogs.com/hfct/p/11652007.html