zoukankan html css js c++ java

python-爬虫-史书典籍

import requests
import os
from lxml import html
import time


def get_title_url(tree):
    """Level 1: collect the book links listed on the site's root page.

    Args:
        tree: Parsed page object exposing an ``xpath(expr)`` method.

    Returns:
        A ``(urls, names)`` tuple holding the ``href`` values and the link
        texts of the anchors matched under the third ``index-li`` div.
        Per the original notes, URLs look like ``/book/sanguoyanyi.html``
        and names like ``三国演义``.
    """
    # Both queries target the same anchors; only the extracted part differs.
    anchors = "//div[@class='index-li'][3]/ul/li/a"
    urls = tree.xpath(anchors + "/@href")
    names = tree.xpath(anchors + "/text()")
    return urls, names


def get_article_url(tree):
    """Level 2: collect the chapter links from a book's table of contents.

    Args:
        tree: Parsed book page exposing an ``xpath(expr)`` method.

    Returns:
        A ``(urls, names)`` tuple holding the ``href`` values and the link
        texts of the anchors inside the ``book-mulu`` div. Per the original
        notes, URLs look like ``/book/sanguoyanyi/1.html``.
    """
    # Same anchors for both queries: attribute first, then visible text.
    entries = "//div[@class='book-mulu']/ul/li/a"
    chapter_urls = tree.xpath(entries + "/@href")
    chapter_names = tree.xpath(entries + "/text()")
    return chapter_urls, chapter_names


def get_article(tree):
    """Level 3: extract the body text of a single chapter page.

    Args:
        tree: Parsed chapter page exposing an ``xpath(expr)`` method.

    Returns:
        The text nodes of every ``<p>`` directly under the
        ``chapter_content`` div, concatenated with no separator.
    """
    return ''.join(tree.xpath("//div[@class='chapter_content']/p/text()"))

def get_request(url, headers, timeout=30):
    """Download a page and parse it into an lxml element tree.

    Args:
        url: Absolute URL to fetch.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The element tree produced by ``lxml.html.fromstring`` from the
        response text.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or an HTTP error status.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing (and later saving) an
    # error page as if it were real content.
    response.raise_for_status()
    return html.fromstring(response.text)

def save_mkdir(two):
    """Make sure the output folder for one book exists.

    Creates ``史书典籍/<two>`` relative to the current working directory,
    including the top-level ``史书典籍`` folder when it is missing. Calling
    it again for an existing folder is a no-op.

    Args:
        two: Name of the second-level (per-book) folder.
    """
    # makedirs with exist_ok avoids the check-then-create race of
    # os.path.exists() followed by os.mkdir().
    os.makedirs(os.path.join('史书典籍', two), exist_ok=True)

def police_2(a):
    """Level-2 (book) resume check backed by ``史书典籍/police_2.txt``.

    The checkpoint file stores the index of the book most recently started.
    Indices below the stored value are reported as already done; any other
    index is recorded as the new checkpoint.

    Args:
        a: Zero-based index of the book about to be crawled.

    Returns:
        ``False`` when ``a`` is smaller than the stored index (skip it),
        otherwise ``True`` after the checkpoint has been updated to ``a``.

    Raises:
        ValueError: If the checkpoint file holds non-numeric text.
    """
    checkpoint = '史书典籍/police_2.txt'
    if os.path.exists(checkpoint):
        with open(checkpoint, 'r') as f:
            saved = f.read().strip()
        # An empty file (e.g. a write interrupted right after truncation)
        # counts as "no checkpoint" and is rewritten below.
        if saved and a < int(saved):
            return False
    # The very first call runs before any folder has been created, so make
    # sure the parent directory exists before writing.
    os.makedirs('史书典籍', exist_ok=True)
    with open(checkpoint, 'w') as f:
        f.write(str(a))
    return True



def police_3(a):
    """Level-3 (chapter) resume check backed by ``史书典籍/police_3.txt``.

    The checkpoint file stores the index of the chapter most recently
    started. Indices below the stored value are reported as already done;
    any other index is recorded as the new checkpoint.

    Args:
        a: Zero-based index of the chapter about to be downloaded.

    Returns:
        ``False`` when ``a`` is smaller than the stored index (skip it),
        otherwise ``True`` after the checkpoint has been updated to ``a``.

    Raises:
        ValueError: If the checkpoint file holds non-numeric text.
    """
    checkpoint = '史书典籍/police_3.txt'
    if os.path.exists(checkpoint):
        with open(checkpoint, 'r') as f:
            saved = f.read().strip()
        # An empty file (e.g. a write interrupted right after truncation)
        # counts as "no checkpoint" and is rewritten below.
        if saved and a < int(saved):
            return False
    # Do not rely on a caller having created the folder beforehand.
    os.makedirs('史书典籍', exist_ok=True)
    with open(checkpoint, 'w') as f:
        f.write(str(a))
    return True


def main():
    """Crawl every listed book and save each chapter as a text file.

    Walks three levels: the site root (book list), each book page (chapter
    list) and each chapter page (text). Chapters are written to
    ``史书典籍/<book>/<chapter>.txt``. Progress is tracked through
    ``police_2`` (book index) and ``police_3`` (chapter index) so an
    interrupted run resumes where it stopped.
    """
    # Site root that all relative links are appended to.
    root = 'http://www.shicimingju.com'
    # Browser-like user agent sent with every request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }

    # The checkpoint files live in this folder, and the first checkpoint is
    # written before any book folder is created.
    os.makedirs('史书典籍', exist_ok=True)

    # Level 1: book names and relative URLs from the root page.
    tree1 = get_request(root, headers)
    book_hrefs, book_titles = get_title_url(tree1)

    # zip() pairs each link with its title; if the two lists ever differ in
    # length the surplus entries are ignored rather than raising IndexError.
    for i, (book_href, book_title) in enumerate(zip(book_hrefs, book_titles)):
        if not police_2(i):
            continue
        # Level 2: the book's table of contents.
        url2 = root + book_href
        print("爬取>>>" + book_title + '开始')
        tree2 = get_request(url2, headers)
        chapter_hrefs, chapter_titles = get_article_url(tree2)
        save_mkdir(book_title)

        for j, (chapter_href, chapter_title) in enumerate(
                zip(chapter_hrefs, chapter_titles)):
            if not police_3(j):
                continue
            # Pause between chapter requests.
            time.sleep(1)
            # Level 3: the chapter page itself.
            url3 = root + chapter_href
            print("爬取:" + chapter_title)
            tree3 = get_request(url3, headers)
            txt = get_article(tree3)
            # Spaces and the middle dot are stripped from the file name.
            file_name = chapter_title.replace(' ', '').replace('·', '')
            file_path = '史书典籍/{}/{}.txt'.format(book_title, file_name)
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(txt)

        # The chapter checkpoint only applies to the book just finished.
        # Without this reset, the next book would skip every chapter whose
        # index is below the last chapter index of this one.
        try:
            os.remove('史书典籍/police_3.txt')
        except FileNotFoundError:
            pass
        print("爬取>>>" + book_title + '结束')



# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

查看全文

相关阅读:
JQuery实现页面跳转
 CSS中让背景图片居中且不平铺
 C#后台将string="23.00"转换成int类型
 BootStrap的一些基本语法
 CSS实现文字阴影的效果
 BootStrap自定义轮播图播放速度
 BootStrap 轮播插件(carousel)支持左右手势滑动的方法(三种)
C#常用快捷键
 jQuery hover() 方法
 鼠标移动有尾巴

原文地址：https://www.cnblogs.com/person1-0-1/p/11316076.html