  • Python crawler: save a novel chapter by chapter

    # coding: utf-8
    import os
    import requests
    from bs4 import BeautifulSoup


    class Downloader:

        def __init__(self, url):
            self.url = url      # index page of the novel
            self.urls = []      # chapter links
            self.names = []     # chapter titles

        def get_chapters(self):
            """Fetch the index page and collect every chapter title and link."""
            response = requests.get(self.url)
            response.encoding = 'gbk'  # the site is GBK-encoded; this avoids garbled text
            self.soup = BeautifulSoup(response.text, 'lxml')  # parse the page
            listmain = self.soup.find('div', class_='listmain')  # the chapter list lives in class='listmain'
            for a in listmain.find_all('a'):  # every <a> under the list is one chapter
                self.names.append(a.string)  # the tag text is the chapter title
                self.urls.append('https://www.biqugex.com%s' % a.get('href'))  # build the absolute chapter link

        def save_chapters(self):
            """Find the novel's title, create a folder with the same name,
            then save every chapter as its own .txt file inside it."""
            h2 = self.soup.select_one('body > div.book > div.info > h2')
            book_name = h2.string
            folder = r'C:\Users\Administrator\Desktop\%s' % book_name  # raw string, otherwise \U breaks the literal
            if not os.path.exists(folder):
                os.mkdir(folder)

            # Walk the chapter links and extract the body text of each chapter
            for name, chapter_url in zip(self.names, self.urls):
                response = requests.get(chapter_url)
                response.encoding = 'gbk'
                soup = BeautifulSoup(response.text, 'lxml')
                content = soup.find('div', id='content')  # chapter body

                filename = os.path.join(folder, name + '.txt')
                print(filename)

                # Write the chapter text to its own file
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(content.text)

        def run(self):
            """If the given URL is not a valid biqugex index page, ask for a correct one."""
            try:
                self.get_chapters()
                self.save_chapters()
            except Exception:
                print('Please enter a valid biqugex novel URL, e.g. https://www.biqugex.com/book_104027/')


    if __name__ == '__main__':
        url = input('Enter the novel index URL: ')
        Downloader(url).run()
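
    One pitfall the script above does not handle: chapter titles scraped from the index page can contain characters that Windows forbids in filenames (such as ? * : " < > |), in which case open() fails for those chapters. Below is a minimal sketch of a sanitizing helper; the name safe_filename and the underscore replacement are my own choice, not part of the original post.

        import re

        def safe_filename(title):
            """Replace characters that Windows forbids in filenames with '_'."""
            return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

        # Possible use inside save_chapters():
        # filename = os.path.join(folder, safe_filename(name) + '.txt')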
  • Original post: https://www.cnblogs.com/hfct/p/11652007.html