zoukankan      html  css  js  c++  java
  • 爬取笔趣阁_完本书籍

    爬取笔趣阁_完本书籍

    爬取地址:http://www.biquge.tv/wanben/1_1

    import os, time, shutil, requests, sqlite3
    from bs4 import BeautifulSoup
    from threading import Thread
    from datetime import datetime
    
    
    def fun_makedir(file_path):
        """
        Create a directory if needed and make it the current working directory.

        :param file_path: directory to create (missing parents included) and enter
        :return: None
        :raises OSError: if the directory cannot be created or entered
        """
        # exist_ok=True removes the window between a separate exists() check
        # and the creation in which another process could create the directory.
        os.makedirs(file_path, exist_ok=True)
        os.chdir(file_path)
    
    
    def main():
        """
        Crawl every book listed on the entry page, then export the results.

        Recreates the database, starts one download thread per book with a
        pause after each start, waits for all threads, prints the totals and
        finally calls show_books() to write the text files.

        :return: None
        """
        create_db()  # start from an empty database
        start = datetime.now()
        url = "http://www.biquge.tv/wanben/1_1"  # entry page of the crawl

        book_urls = get_book(url)  # [book name, book url] pairs
        total_book = len(book_urls)
        threads = [
            Thread(target=save_book_todb, args=(book_id, item[0], item[1]))
            for book_id, item in enumerate(book_urls, start=1)
        ]
        for thread_no, t in enumerate(threads, start=1):
            print("下载进度:[{}{}]".format(">" * (thread_no), "." * (total_book - thread_no)))  # progress bar
            t.start()
            time.sleep(5)
            # `delay` is a module-level global that save_book_todb overwrites
            # from the book's chapter count, so this pause varies per book.
            # Author's note: a 5000-chapter book takes roughly 80 seconds.
            time.sleep(delay)
        for t in threads:
            t.join()
            print(t)
        # `count` is the module-level counter incremented by save_book_todb.
        print("\n共抓取{}部小说\t".format(count))
        run_time = (datetime.now() - start).total_seconds()
        print("总共用时{}秒".format(run_time), end="\t")
        print("{}正在导出小说".format(">" * 100))
        show_books()
    
    
    def create_db():
        """
        Reset the database file named by the module-level ``dbname``.

        An existing file is deleted first; a connection is then opened and
        closed again immediately.

        :return: None
        """
        stale_file_present = os.path.exists(dbname)
        if stale_file_present:
            os.remove(dbname)
        sqlite3.connect(dbname).close()
    
    
    def set_delay(total_chapter):
        """
        Pick a pause length from the number of chapters of a book.

        :param total_chapter: number of chapters
        :return: 30 multiplied by a factor of 1 to 6; the factor grows by one
                 for every full thousand chapters exceeded, up to 5000
        """
        base = 30
        # (exclusive lower bound, multiplier), checked from the largest tier down
        tiers = ((5000, 6), (4000, 5), (3000, 4), (2000, 3), (1000, 2))
        for lower_bound, factor in tiers:
            if total_chapter > lower_bound:
                return base * factor
        return base
    
    
    def get_book(url):
        """
        Fetch a listing page and collect the books linked from it.

        :param url: address of the listing page
        :return: list of [book name, book url] pairs, taken from the first
                 link of every <li> inside the <div class="r"> element;
                 empty when that element is not present on the page
        :raises requests.RequestException: on connection failure or timeout
        """
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.encoding = "gbk"  # decode the body as GBK
        soup = BeautifulSoup(response.text, "html.parser")
        container = soup.find('div', class_='r')
        if container is None:
            # Unexpected page (error page, changed layout): nothing to crawl.
            return []
        books = []
        for entry in container.find_all('li'):
            link = entry.find('a')
            books.append([link.get_text(), link['href']])
        return books
    
    
    def get_chapter(book_url):
        """
        Fetch a book's index page and collect its chapter links.

        :param book_url: address of the book's index page
        :return: list of [chapter name, absolute chapter url] pairs; empty
                 when the <div id="list"> element is not present on the page
        :raises requests.RequestException: on connection failure or timeout
        """
        # Without a timeout a stalled connection would block this book forever.
        chapter_res = requests.get(book_url, headers=headers, timeout=30)
        chapter_res.encoding = 'gbk'  # decode the body as GBK
        chapter_soup = BeautifulSoup(chapter_res.text, "html.parser")
        listing = chapter_soup.find('div', id="list")
        if listing is None:
            # Unexpected page (error page, changed layout): no chapters.
            return []
        chapters = []
        # The first nine <dd> entries are skipped. NOTE(review): presumably a
        # "latest chapters" preview that repeats entries of the full list;
        # confirm against the page layout.
        for entry in listing.find_all('dd')[9:]:
            chapter = entry.find('a')
            chapter_url = "http://www.biquge.tv" + chapter['href']
            chapters.append([chapter.get_text(), chapter_url])
        return chapters
    
    
    def save_book_todb(book_id, book_name, book_url):
        """
        Download every chapter of one book and store them in the database.

        Starts one thread per chapter (save_chapter_todb), 0.01 s apart, and
        waits for all of them to finish.

        :param book_id: sequence number of the book
        :param book_name: title of the book
        :param book_url: address of the book's index page
        :return: None
        """
        global count, delay
        # NOTE(review): main() runs this function in several threads and the
        # increment below is not protected by a lock.
        count += 1
        chapters = get_chapter(book_url)  # all [name, url] chapter pairs
        total_chapter = len(chapters)
        delay = set_delay(total_chapter)  # pause length derived from the chapter count
        print("正在下载----小说 {}{:<2s}{},共有{}章节,请等待{}秒".format('>' * 50, str(book_id), book_name, total_chapter, delay))
        workers = [
            Thread(target=save_chapter_todb,
                   args=(chapter_id, chapter[0], chapter[1], book_id, book_name, book_url))
            for chapter_id, chapter in enumerate(chapters, start=1)
        ]
        for worker in workers:
            worker.start()
            time.sleep(0.01)
        for worker in workers:
            worker.join()
        print("下载完成 {}{:<2s}{},共有{}章节".format('*' * 30, str(book_id), book_name, len(chapters)))
    
    
    def save_chapter_todb(chapter_id, chapter_name, chapter_url, book_id, book_name, book_url):
        """
        Download one chapter page and store its text in the database.

        The text of the <div id="content"> element is stored as scraped; no
        whitespace or blank lines are removed. A failed download or a page
        without that element is reported on stdout and the chapter is skipped.

        :param chapter_id: sequence number of the chapter within the book
        :param chapter_name: chapter title
        :param chapter_url: address of the chapter page
        :param book_id: sequence number of the book
        :param book_name: title of the book
        :param book_url: address of the book's index page
        :return: None
        """
        try:
            # Without a timeout a stalled connection would block this thread
            # (and the join() waiting for it) forever.
            down_chapter_res = requests.get(chapter_url, headers=headers, timeout=30)
        except requests.RequestException:
            print(
                "下载章节出错 {}书名:{},章节{:<4s}{},章节链接:{}".format('.' * 10, book_name, str(chapter_id), chapter_name, chapter_url))
            return
        down_chapter_res.encoding = 'gbk'  # decode the body as GBK
        down_chapter_soup = BeautifulSoup(down_chapter_res.text, "html.parser")
        content = down_chapter_soup.find('div', id="content")
        if content is None:
            print(
                "章节内容缺失 {}书名:{},章节{:<4s}{},章节链接:{}".format('.' * 10, book_name, str(chapter_id), chapter_name, chapter_url))
            return
        # No quote escaping here: save_db binds every value as a parameter.
        save_db(chapter_id, chapter_name, content.text, chapter_url, book_id, book_name, book_url)


    def save_db(chapter_id, chapter_name, chapter_text, chapter_url, book_id, book_name, book_url):
        """
        Insert one chapter row into the book's table, creating the table first.

        Database errors are reported on stdout and not re-raised, so a failed
        chapter does not stop the others.

        :param chapter_id: sequence number of the chapter within the book
        :param chapter_name: chapter title
        :param chapter_text: chapter body, stored exactly as given
        :param chapter_url: address of the chapter page
        :param book_id: sequence number of the book; selects the table
        :param book_name: title of the book
        :param book_url: address of the book's index page
        :return: None
        """
        try:
            table_name = create_table_book(book_id)
            conn = sqlite3.connect(dbname)
            try:
                # Values are bound as parameters so quotes in titles or text
                # cannot break the statement. Only the table name is
                # concatenated, because identifiers cannot be bound.
                # The book_id column receives the table name.
                conn.execute(
                    "insert into " + table_name + " values(?,?,?,?,?,?,?)",
                    (chapter_id, chapter_name, chapter_text, chapter_url, table_name, book_name, book_url))
                conn.commit()
            finally:
                conn.close()  # released even when the insert fails
        except sqlite3.Error:
            print(
                "保存章节出错 {}书名:{},章节{:<4s}{},章节链接:{}".format('.' * 10, book_name, str(chapter_id), chapter_name, chapter_url))
    
    
    def create_table_book(table_id):
        """
        Make sure the table for one book exists and return its name.

        :param table_id: value appended to "book_" to form the table name
        :return: the table name, "book_<table_id>"
        :raises sqlite3.Error: if the database cannot be opened or written
        """
        table_name = "book_" + str(table_id)
        # "IF NOT EXISTS" makes repeated calls for the same book harmless.
        sql = ("create table IF NOT EXISTS " + table_name + "(chapter_id int,chapter_name varchar(20),"
               "chapter_text varchar(10000),chapter_url varchar(60),book_id varchar(20),"
               "book_name varchar(100),book_url varchar(100))")
        conn = sqlite3.connect(dbname)
        try:
            conn.execute(sql)
            conn.commit()
        finally:
            conn.close()  # released even when the statement fails
        return table_name
    
    
    def show_books():
        """
        Export every book table of the database to a text file.

        Looks up the tables named "book_<number>", starts one show_book thread
        per table (0.1 s apart, in ascending number order), waits for all of
        them and prints how many books were exported and how long it took.

        :return: None
        :raises sqlite3.Error: if the database cannot be opened or read
        """
        start = datetime.now()
        conn = sqlite3.connect(dbname)
        try:
            names = [row[0] for row in conn.execute("select name from sqlite_master where type = 'table'")]
        finally:
            conn.close()
        # Use the ids that actually exist rather than assuming 1..N: a book
        # whose download failed leaves a gap in the numbering.
        prefix = "book_"
        table_ids = sorted(
            int(name[len(prefix):]) for name in names
            if name.startswith(prefix) and name[len(prefix):].isdecimal()
        )
        threads = [Thread(target=show_book, args=(table_id,)) for table_id in table_ids]
        for t in threads:
            t.start()
            time.sleep(0.1)
        for t in threads:
            t.join()
        run_time = (datetime.now() - start).total_seconds()
        print("\n\n导出小说完成,共导出{}部小说。".format(len(threads)), end="\t")
        print("花费{}秒。".format(run_time), end="\n\n")
    
    
    def show_book(table_id):
        """
        Export one book table to a UTF-8 text file in the working directory.

        The file is named "<book_id> <book_name>[共<n>章].txt". It starts
        with a header (title, chapter count, book url) followed by every
        chapter in chapter_id order. An existing file of that name is
        overwritten. A table without rows is reported and skipped.

        :param table_id: number of the table; rows are read from "book_<table_id>"
        :return: None
        :raises sqlite3.Error: if the database or the table cannot be read
        """
        sql = "select book_id,book_name,chapter_name,chapter_text,chapter_id,book_url from book_" + str(
            table_id) + " order by chapter_id"
        conn = sqlite3.connect(dbname)
        try:
            results = conn.execute(sql).fetchall()
        finally:
            conn.close()  # released even when the query fails
        if not results:
            # Nothing to name the file after and nothing to write.
            print("小说 book_{} 没有章节,跳过导出".format(table_id))
            return
        first = results[0]
        file_name = first[0] + " " + first[1] + "[共" + str(len(results)) + "章]" + ".txt"
        print("正在导出小说>>>{}".format(file_name))
        # Open once in write mode: this truncates any previous export and
        # avoids reopening the file for every chapter.
        with open(file_name, 'w', encoding='utf-8') as f:
            # Header block written once at the top of the file.
            f.write("{}完本小说{}\n\n{}**【{}】**\n{}共有{}章节\n{}在线阅读网址:{}\n\n{}"
                    .format("*" * 40, "*" * 40, " " * 20, first[1], " " * 20, len(results), " " * 20, first[5], "*" * 90))
            for r in results:
                f.write("{}第{}章:{}{}".format("\n\n", r[4], r[2], "\n\n"))  # chapter heading
                f.write(r[3])  # chapter text
    
    
    if __name__ == '__main__':  # script entry point
        # Module-level settings; the functions of this script read them as globals.
        category = "笔趣阁"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36"}
        count = 0
        delay = 20  # pause length in seconds

        # Database file name carries today's local date.
        dbname = "笔趣阁小说_" + time.strftime("%Y-%m-%d") + ".sqlite"

        save_path = os.getcwd() + '/down/' + category
        fun_makedir(save_path)  # create the output folder and switch into it
        # main()  # full crawl, currently disabled
        show_books()
    
    
  • 相关阅读:
    js_未结束的字符串常量
    [转]关于项目管理的思考
    Nhibernate理解
    Visual Studio 2005常用插件搜罗
    基本概念
    resharper 2.0
    Nhibernate资源
    [转]关于项目管理的知识点
    style
    带分数 蓝桥杯
  • 原文地址:https://www.cnblogs.com/yuexiao/p/12823899.html
Copyright © 2011-2022 走看看