  • Python Scraping Exercise: Grabbing a Novel from Biqukan (Part 1)

    An exercise in using requests and BeautifulSoup to scrape a novel and save it to the D: drive.

    It is fairly slow, and requests to the server are easily interrupted.
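
    One way to soften the "easily interrupted" problem is to reuse a session and retry failed requests with a backoff. This is a minimal sketch, not part of the original script; the retry count, backoff factor, and timeout are arbitrary assumptions:

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry

    # Share one session so TCP connections are reused between chapter fetches
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=0.5,
                  status_forcelist=[500, 502, 503, 504])
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.mount("https://", HTTPAdapter(max_retries=retry))

    # session.get(...) could then replace requests.get(...) in the script below,
    # e.g.: req = session.get(url, timeout=10)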

    # -*- coding:UTF-8 -*-
    import requests
    from bs4 import BeautifulSoup

    """
    Fetch the book's table of contents
    """


    def getBookContents(url):
        req = requests.get(url=url)
        # The site serves GBK-family encoded pages
        req.encoding = "gb2312"
        html = req.text

        dv_bf = BeautifulSoup(html, "html5lib")
        dv = dv_bf.find("div", class_="listmain")

        # The chapter links are the <a> tags inside the listmain div
        a = dv.find_all("a")

        book_contents_list = []
        i = 0
        # Skip the first 13 links: the "latest chapters" block at the top of the list
        for content in a[13:]:
            book_title = content.string
            book_url = content.get("href")
            try:
                # Data cleaning: find the index of the "章" character in the title;
                # if it is missing, a ValueError is raised and the entry is skipped
                book_title_index = str(book_title).index("章", 0)
                # Slice past "章" to get the bare chapter title
                new_book_title = book_title[book_title_index + 1:]
                # Renumber the chapter and strip leading whitespace from the title
                i = i + 1
                new_book_titles = "第{}章".format(i) + new_book_title.lstrip()
                new_book_url = "http://www.biqukan.com{}".format(book_url)

                # Store each chapter as a one-entry dict
                contents = {new_book_titles: new_book_url}
                # Append it to the list
                book_contents_list.append(contents)
            except ValueError:
                # No "章" in the title, so this is not a proper chapter link
                print("***************** Not a chapter node, skipped ****************")
                print("original title =", book_title)
                print("original link =", book_url)
        return book_contents_list
    
    
    """
    通过文章链接地址获取章节内容
    """
    
    
    def getConnect(url):
        req = requests.get(url=url)
        req.encoding = 'gb2312'
        html = req.text
        div_bf = BeautifulSoup(html, "html5lib")
        div = div_bf.find("div", id="content")
        # Remove embedded <script> tags before extracting the text
        [s.extract() for s in div('script')]
        return div.text
    
    
    """
    将小说内容写入到文件
    """
    
    
    def saveData(filepath, text):
        with open(filepath, mode="w", encoding="UTF-8") as f:
            f.writelines(text)
            f.write('\n\n')
    
    
    if __name__ == '__main__':

        book_list = getBookContents("http://www.biqukan.com/1_1094")

        for li in book_list:
            filepath = "d:\\123\\"
            connecturl = ""
            # Each entry is a one-item dict of {chapter title: chapter URL}
            for aa in li.keys():
                filepath = filepath + aa
                connecturl = li[aa]

            text = getConnect(connecturl)
            saveData(filepath, text)
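
    One caveat with the loop above: chapter titles scraped from the page can contain characters that Windows forbids in filenames (such as ? or |), which would make open() fail inside saveData. A small helper like the hypothetical sanitize_title below (an assumption, not part of the original post) strips them; appending a .txt extension also makes the output files easier to open:

    import re


    def sanitize_title(title):
        # Drop characters that are not allowed in Windows filenames
        return re.sub(r'[\\/:*?"<>|]', "", title)


    # Hypothetical usage inside the main loop:
    # filepath = "d:\\123\\" + sanitize_title(aa) + ".txt"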
    
  • Original post: https://www.cnblogs.com/dangzhengtao/p/12213513.html