zoukankan      html  css  js  c++  java
  • 爬虫习作-爬小说

    # coding=utf8
    import traceback
    from bs4 import BeautifulSoup
    import requests
    import re
    import time
    
    #get novel information!  https://www.bxwxorg.com/
    
    def getHTMLText(url):
        """Fetch *url* and return the decoded page source, or "" on failure.

        Re-decodes with ``apparent_encoding`` so pages that mis-declare their
        charset (common on this site) still come out readable.

        Returns:
            str: page source on success, "" on any network/HTTP error.
        """
        try:
            # BUG FIX: no timeout meant one dead server could hang the whole
            # crawl forever; 10s is generous for a single chapter page.
            r = requests.get(url, timeout=10)
            r.raise_for_status()  # turn non-2xx status into an exception
            r.encoding = r.apparent_encoding  # decode with the sniffed encoding
            return r.text
        except requests.RequestException:
            # BUG FIX: was a bare `except:` that also swallowed KeyboardInterrupt
            # etc.; only network/HTTP failures should map to "page unavailable".
            return ""
    
    def getList(lst, URL):
        """Collect the novel's chapter ids from its index page into *lst*.

        Chapter links look like ``/read/129669/123.html``: the second run of
        digits in the href is the chapter number, the first is the book id.

        Args:
            lst: list the chapter-id strings are appended to (deduplicated).
            URL: URL of the novel's chapter-index page.

        Returns:
            list: *lst*, sorted in ascending numeric chapter order.
        """
        html = getHTMLText(URL)
        soup = BeautifulSoup(html, "html.parser")
        for anchor in soup.find_all('a'):  # scan every link on the index page
            try:
                href = anchor.attrs['href']
                # BUG FIX: the published pattern r"d{1,8}" matched the literal
                # letter 'd' (the backslash was lost); digits need the \d escape.
                nums = re.findall(r"\d{1,8}", href)
                if nums[1] not in lst:
                    lst.append(nums[1])
            except (KeyError, IndexError):
                # <a> without an href, or an href without two digit runs —
                # not a chapter link, skip it. (Was a bare `except:`.)
                continue
        # BUG FIX: plain string sort put "10" before "9"; sort numerically.
        lst.sort(key=int)
        print("小说共%s章节:" % len(lst))
        return lst
    
    def getInfo(lst, articlURL, path):
        """Download every chapter in *lst* and append it to <path><title>.txt.

        Args:
            lst: chapter-id strings as produced by getList().
            articlURL: base URL of the novel, e.g.
                "https://www.bxwxorg.com/read/129669/".
            path: output directory; must end with a path separator, since the
                file name is built by plain string concatenation.
        """
        timeStart = time.time()  # wall-clock timer for the whole crawl
        for articlNum in lst:
            url = articlURL + articlNum + ".html"
            print(url)
            html = getHTMLText(url)  # "" signals a 404/network failure
            if html == "":
                # BUG FIX: the original printed a bare "%s" placeholder with no
                # argument AND fell through to parse the empty page anyway.
                print("%s信息不存在!" % url)
                continue
            try:
                soup = BeautifulSoup(html, "html.parser")
                title = soup.body['article-name']   # novel title (site-specific <body> attribute)
                chapter = soup.find('h1').text      # chapter heading
                print("======开始爬取<%s>信息======" % chapter)
                fpath = path + title + ".txt"
                content = soup.find_all('div', id='content')
                with open(fpath, 'a', encoding='utf-8') as f:
                    for block in content:
                        # NOTE(review): the replace() literals were destroyed
                        # when this listing was published (raw line breaks
                        # inside the quotes); the surviving comment says
                        # "替换成换行" (replace with newline), so nbsp
                        # indentation is mapped to line breaks here — confirm
                        # against the live page markup.
                        text = block.text.replace('\xa0' * 4, '\n')
                        f.write(chapter + text + "\n" * 2)
            except Exception:
                # Keep best-effort semantics: log the failure, move on to the
                # next chapter. (Was a bare `except:`.)
                traceback.print_exc()
                print("======error======")
        print("耗时:%s 秒" % (time.time() - timeStart))
    
    # ---- script entry point ----
    if __name__ == "__main__":
        lst = []
        url = "https://www.bxwxorg.com/read/129669/"
        getList(lst, url)
        # lst = lst[:5]  # uncomment to crawl only the first 5 chapters
        # BUG FIX: the original literal ended with a bare backslash
        # ("...Info\"), which is a Python syntax error, and "\w"/"\A" etc.
        # are unintended escapes — escape every separator.
        path = "F:\\workspace\\API_test\\Crawlers\\Info\\"
        getInfo(lst, url, path)

     经过几天的努力,终于小有收获!happy~

  • 相关阅读:
    判断java中两个对象是否相等
    面试题记录
    springboot集成redis操作
    Java 之Integer相等比较
    CSS+DIV网页样式与布局:第二章:CSS的基本语法
    JSP标签:jsp内置标签、jstl标签、自定义标签
    jsp jstl标签库 el表达式
    mysql数据库修改字段类型
    读CSS DIV网页样式与布局心得体会
    Absolute(绝对定位)与relative(相对定位)的图文讲解
  • 原文地址:https://www.cnblogs.com/bubutianshu/p/14267235.html
Copyright © 2011-2022 走看看