zoukankan      html  css  js  c++  java
  • python-爬虫-史书典籍

    import requests
    import os
    from lxml import html
    import time
    
    
    def get_title_url(tree):
        '''Level 1: collect the links and names of the books listed on the index page.

        Runs two XPath queries on ``tree`` against the anchors under the
        ``index-li`` div picked by the ``[3]`` predicate (the "史书典籍"
        section, per the original author's note).

        Returns a ``(hrefs, names)`` tuple:
          * hrefs -- link targets, e.g. ``/book/sanguoyanyi.html``
          * names -- anchor texts, e.g. ``三国演义``
        '''
        anchor_path = "//div[@class='index-li'][3]/ul/li/a"
        hrefs = tree.xpath(anchor_path + "/@href")
        names = tree.xpath(anchor_path + "/text()")
        return hrefs, names
    
    
    def get_article_url(tree):
        '''Level 2: collect the chapter links and chapter titles of one book.

        Runs two XPath queries on ``tree`` against the anchors inside the
        ``book-mulu`` div (the book's table of contents).

        Returns a ``(hrefs, titles)`` tuple:
          * hrefs  -- link targets, e.g. ``/book/sanguoyanyi/1.html``
          * titles -- anchor texts, e.g. ``第一回·宴桃园豪杰三结义  斩黄巾英雄首立功``
        '''
        anchor_path = "//div[@class='book-mulu']/ul/li/a"
        hrefs = tree.xpath(anchor_path + "/@href")
        titles = tree.xpath(anchor_path + "/text()")
        return hrefs, titles
    
    
    def get_article(tree):
        '''Level 3: extract the text of one chapter page.

        Queries ``tree`` for the text nodes of every ``<p>`` directly under the
        ``chapter_content`` div and concatenates them with no separator.

        Returns the chapter text as a single string (empty when nothing matches).
        '''
        text_fragments = tree.xpath("//div[@class='chapter_content']/p/text()")
        chapter_text = ''.join(text_fragments)
        return chapter_text
    
    def get_request(url, headers, timeout=10):
        '''Fetch a page and parse it into an lxml HTML tree.

        Parameters:
          url     -- absolute URL to request.
          headers -- dict of HTTP headers sent with the request.
          timeout -- seconds to wait for the server before giving up.  Without
                     a timeout ``requests`` waits indefinitely, so one stalled
                     connection would hang the whole crawl.

        Returns the element tree built by ``lxml.html.fromstring`` from the
        response text.

        Raises ``requests.exceptions.RequestException`` (including ``Timeout``
        and ``HTTPError``) when the request fails or the server answers with
        an error status.
        '''
        response = requests.get(url=url, headers=headers, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page and silently
        # yielding empty XPath results further down.
        response.raise_for_status()
        return html.fromstring(response.text)
    
    def save_mkdir(two):
        '''Make sure the output folder for one book exists.

        Creates the top-level ``史书典籍`` directory and the sub-directory
        ``史书典籍/<two>`` under the current working directory.  Directories
        that already exist are left untouched.

        Parameters:
          two -- name of the second-level (per-book) folder.
        '''
        # makedirs creates the parent as needed, and exist_ok avoids the
        # check-then-create race of a separate os.path.exists() test.
        os.makedirs('史书典籍/' + two, exist_ok=True)
    
    def police_2(a):
        '''Level 2 resume check: decide whether book index ``a`` must be processed.

        The index of the most recently started book is kept in
        ``史书典籍/police_2.txt``.

        Parameters:
          a -- zero-based index of the book about to be processed.

        Returns ``False`` when the checkpoint holds a larger index than ``a``
        (the book was handled in an earlier run and is skipped).  Otherwise
        writes ``a`` to the checkpoint file and returns ``True``.

        A missing, empty or non-numeric checkpoint file is treated as "no
        checkpoint yet".
        '''
        checkpoint_path = '史书典籍/police_2.txt'

        try:
            with open(checkpoint_path, 'r') as f:
                saved = f.read()
        except FileNotFoundError:
            saved = ''

        # int('') and int('garbage') both raise ValueError, so one handler
        # covers the empty-file and the corrupted-file case.
        try:
            last_index = int(saved)
        except ValueError:
            last_index = None

        if last_index is not None and a < last_index:
            return False

        # The checkpoint directory may not exist yet on the very first call.
        os.makedirs('史书典籍', exist_ok=True)
        with open(checkpoint_path, 'w') as f:
            f.write(str(a))
        return True
    
    
    
    def police_3(a):
        '''Level 3 resume check: decide whether chapter index ``a`` must be processed.

        The index of the most recently started chapter is kept in
        ``史书典籍/police_3.txt``.

        Parameters:
          a -- zero-based index of the chapter about to be downloaded.

        Returns ``False`` when the checkpoint holds a larger index than ``a``
        (the chapter was handled in an earlier run and is skipped).  Otherwise
        writes ``a`` to the checkpoint file and returns ``True``.

        A missing, empty or non-numeric checkpoint file is treated as "no
        checkpoint yet".
        '''
        checkpoint_path = '史书典籍/police_3.txt'

        try:
            with open(checkpoint_path, 'r') as f:
                saved = f.read()
        except FileNotFoundError:
            saved = ''

        # int('') and int('garbage') both raise ValueError, so one handler
        # covers the empty-file and the corrupted-file case.
        try:
            last_index = int(saved)
        except ValueError:
            last_index = None

        if last_index is not None and a < last_index:
            return False

        # The checkpoint directory may not exist yet on the very first call.
        os.makedirs('史书典籍', exist_ok=True)
        with open(checkpoint_path, 'w') as f:
            f.write(str(a))
        return True
    
    
    def main():
        '''Entry point: crawl every listed book and save each chapter as a text file.

        Flow:
          1. Fetch the site root and read the book links/names (level 1).
          2. For each book, fetch its table of contents (level 2).
          3. For each chapter, fetch the page and write its text to
             ``史书典籍/<book name>/<chapter title>.txt`` (level 3).

        ``police_2`` / ``police_3`` are consulted before each book / chapter so
        that an interrupted run can continue where it stopped.
        '''
        # Site root; the hrefs returned by the XPath helpers are appended to it.
        root = 'http://www.shicimingju.com'
        # Request headers (browser user agent).
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
        }

        # The checkpoint files live in this directory and police_2 is called
        # before save_mkdir, so the directory has to exist up front.
        os.makedirs('史书典籍', exist_ok=True)

        # Level 1: index page -> book links and names.
        tree1 = get_request(root, headers)
        History_book_url_list, History_book_name_list = get_title_url(tree1)

        for i, (book_href, book_name) in enumerate(
                zip(History_book_url_list, History_book_name_list)):
            if police_2(i) is False:
                continue

            # Level 2: table of contents of this book.
            url2 = root + book_href
            print("爬取>>>" + book_name + '开始')
            tree2 = get_request(url2, headers)
            book_url_list, book_name_list = get_article_url(tree2)

            # Output folder for this book.
            save_mkdir(book_name)

            for j, (chapter_href, txt_name) in enumerate(
                    zip(book_url_list, book_name_list)):
                if police_3(j) is False:
                    continue
                # Pause between chapter requests.
                time.sleep(1)

                # Level 3: chapter page -> text.
                url3 = root + chapter_href
                print("爬取:" + txt_name)
                tree3 = get_request(url3, headers)
                txt = get_article(tree3)

                # Spaces and the middle dot are stripped from the title before
                # it is used as the file name.
                file_path = '史书典籍/{}/{}.txt'.format(
                    book_name, txt_name.replace(' ', '').replace('·', ''))
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(txt)

            # The chapter checkpoint belongs to the book just finished.  If it
            # were kept, the first chapters of the next book (indices below the
            # stored value) would be skipped by police_3.
            try:
                os.remove('史书典籍/police_3.txt')
            except FileNotFoundError:
                pass
            print("爬取>>>" + book_name + '结束')
    
    
    
    # Run the crawler only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    Leetcode 238. Product of Array Except Self
    来博客园的第一天
    [LeetCode] 1020. Number of Enclaves
    [LeetCode] 921. Minimum Add to Make Parentheses Valid
    [LeetCode] 1541. Minimum Insertions to Balance a Parentheses String
    [LeetCode] 738. Monotone Increasing Digits
    [LeetCode] 1669. Merge In Between Linked Lists
    [LeetCode] 865. Smallest Subtree with all the Deepest Nodes
    [LeetCode] 376. Wiggle Subsequence
    [LeetCode] 1170. Compare Strings by Frequency of the Smallest Character
  • 原文地址:https://www.cnblogs.com/person1-0-1/p/11316076.html
Copyright © 2011-2022 走看看