  • A Python program for backing up cnblogs (博客园) posts

    I. Build the cnblogs URL list

    1. The SQL table

    Create the cbs database.
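
    If the cbs database does not exist yet, a single statement creates it (charset chosen to match the table definition below):

    CREATE DATABASE IF NOT EXISTS cbs DEFAULT CHARACTER SET utf8;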

    Then create the table by running article_list.sql:

    SET FOREIGN_KEY_CHECKS=0;
    
    -- ----------------------------
    -- Table structure for article_list
    -- ----------------------------
    DROP TABLE IF EXISTS `article_list`;
    CREATE TABLE `article_list` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `title` varchar(255) DEFAULT NULL,
      `countView` varchar(10) DEFAULT NULL,
      `countComment` varchar(10) DEFAULT NULL,
      `url` varchar(100) DEFAULT NULL,
      `datePublished` datetime DEFAULT NULL,
      `dateUpdated` datetime DEFAULT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
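
    The script can be loaded with the standard MySQL command-line client, for example: mysql -u root -p cbs < article_list.sql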
    


    2. The cookie

    Log in to cnblogs and save the cookie value of that session into mycookie.txt.
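
    For reference, mycookie.txt should hold the raw Cookie request-header value on a single line, copied from the browser's developer tools while logged in. The names and values below are made-up placeholders, not real cnblogs cookies:

    .CNBlogsCookie=XXXXXXXX; .Cnblogs.AspNetCore.Cookies=XXXXXXXX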

    3. Fetch the URL list

    Python dependencies

    requirements.txt

    certifi==2020.4.5.1
    chardet==3.0.4
    et-xmlfile==1.0.1
    idna==2.9
    jdcal==1.4.1
    openpyxl==3.0.3
    PyMySQL==0.9.3
    requests==2.23.0
    urllib3==1.25.9
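
    Install them with: pip install -r requirements.txt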
    


    The Python program

    import time
    import json
    import math
    import requests
    import sys
    import os
    
    from openpyxl import Workbook
    import pymysql
    
    # Banner printed before startup
    version_str = "V1.2"
    logo_str = "cnBlogs_List"
    logo_pic_str = r"""
                ____  _                           _     _     _  
      ___ _ __ | __ )| | ___   __ _ ___          | |   (_)___| |_
     / __| '_ \|  _ \| |/ _ \ / _` / __|         | |   | / __| __|
    | (__| | | | |_) | | (_) | (_| \__ \         | |___| \__ \ |_
     \___|_| |_|____/|_|\___/ \__, |___/  _____  |_____|_|___/\__|
                              |___/      |_____|                 
    """
    print("%s %s" % (logo_str, version_str), end='')
    print(logo_pic_str)
    print("%s %s starting..." % (logo_str, version_str))
    time.sleep(2.5)
    
    # Startup
    # Configuration
    # Option 1: assign the cookie value directly to the constant COOKIE_STR
    COOKIE_STR = ""
    # Option 2: store the cookie value in a dedicated file
    COOKIE_PATH = "./mycookie.txt"
    if os.path.exists(COOKIE_PATH) and len(COOKIE_STR) == 0:
        # If the cookie file exists and COOKIE_STR was not assigned above, read the cookie from the file
        with open(COOKIE_PATH, 'r', encoding="utf-8") as f:
            data_str = f.read()
            if len(data_str) > 0:
                # The file has content, so use it as the cookie value
                COOKIE_STR = data_str
            else:
                print("There is no cookie value in the file %s" % COOKIE_PATH)
                sys.exit()
    CACHE_FILE_NAME = "cache.txt"  # cache file name
    EXCEL_FILE_NAME = "result.xlsx"  # spreadsheet file name (openpyxl writes xlsx, not legacy xls)
    WORKSHEET_NAME = "cnblogs_admin"  # worksheet name
    # TABLE_HEAD_TITLE_LIST = ["title", "countView", "countComment", "url", "datePublished", "dateUpdated"]  # English header names
    TABLE_HEAD_TITLE_LIST = ["标题", "阅读量", "评论数", "链接", "首次发布时间", "最近更新时间"]  # header row for result.xlsx / cache.txt
    SINGLE_PAGE_COUNT = 10  # number of posts per list page
    
    # Request parameters
    # Old beta endpoint: r'https://i-beta.cnblogs.com/api/posts/list?p=%s&cid=&t=1&cfg=0'
    REQUEST_URL_PART = r'https://i.cnblogs.com/api/posts/list?p=%s&cid=&t=1&cfg=0'
    REQUEST_HEADERS = {
        'authority': 'i-beta.cnblogs.com',
        'method': 'GET',
        'path': '/api/posts/list?p=1&cid=&t=1&cfg=0',
        'scheme': 'https',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cookie': COOKIE_STR,
        'referer': 'https://i-beta.cnblogs.com/posts',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    }
    
    # Global variables
    info_result_all = []  # aggregated rows from all pages
    
    # MySQL connection parameters
    MYSQL_HOST = 'localhost'
    MYSQL_PORT = 3306
    MYSQL_USER = 'root'
    MYSQL_PASSWD = 'root'
    MYSQL_DB = 'cbs'
    MYSQL_CHARSET = 'utf8'
    
    # Connect to the database
    connect = pymysql.Connect(
        host=MYSQL_HOST,
        port=MYSQL_PORT,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWD,
        db=MYSQL_DB,
        charset=MYSQL_CHARSET
    )
    # Get a cursor
    cursor = connect.cursor()
    
    
    # Function definitions
    # Get the total number of list pages
    def get_page_count(page_n=1):
        page_num = page_n
        # Build the request
        url = REQUEST_URL_PART % page_num
        headers = REQUEST_HEADERS
        # Send the GET request
        response = requests.get(url, headers=headers, verify=True)
        html_data = response.content.decode()
        data_dict = json.loads(html_data)
        all_post_count = data_dict["postsCount"]  # total number of posts
        page_count = math.ceil(all_post_count / SINGLE_PAGE_COUNT)  # total number of pages
        # Return the total page count
        return page_count
    
    
    # Fetch one list page
    # 1. Print the rows to the console
    # 2. Insert them into the article_list table of the cbs MySQL database
    def get_per_page_data(page_n=1, save_mysql=True):
        page_num = page_n

        url = REQUEST_URL_PART % page_num
        headers = REQUEST_HEADERS
        # Send the GET request
        response = requests.get(url, headers=headers, verify=True)
        html_data = response.content.decode()
        data_dict = json.loads(html_data)
        post_list = data_dict["postList"]  # posts on this page
        all_post_count = data_dict["postsCount"]  # total number of posts
        page_count = math.ceil(all_post_count / SINGLE_PAGE_COUNT)  # total number of pages

        print("[status %s] page %s" % (response.status_code, page_n))

        info_result = []  # rows to write out
        # On the first page, start with the header row
        if page_num == 1:
            info = TABLE_HEAD_TITLE_LIST  # header row
            info_result.append(info)

        # Collect the rows of this page
        for index, item in enumerate(post_list):
            title = item['title']  # title
            viewCount = item['viewCount']  # view count
            comment_count = item['feedBackCount']  # comment count
            url = "https:%s" % item['url']  # post URL
            datePublished = item['datePublished']  # first published date
            dateUpdated = item['dateUpdated']  # last updated date
            # Print to the console
            print((index + ((page_n - 1) * SINGLE_PAGE_COUNT) + 1), end='  ')
            print(title)

            info = [title, viewCount, comment_count, url, datePublished, dateUpdated]  # one row
            # Keep the row in memory
            info_result.append(info)

            if save_mysql is True:
                # Insert the row into MySQL
                # Parameterized query, so quotes in titles cannot break the statement
                sql_1 = "INSERT INTO article_list(title,countView,countComment,url,datePublished,dateUpdated) VALUES (%s,%s,%s,%s,%s,%s);"
                data = tuple(info)
                cursor.execute(sql_1, data)  # run the INSERT
                connect.commit()  # commit the insert

        return page_count, page_num, info_result
    
    
    # Iterate over the list pages; save the data to MySQL and cache.txt
    def get_cbs_list_data(page_count, save_cache=True):
        for n in range(1, page_count + 1):
            time.sleep(1)  # sleep one second to throttle requests
            one_page_data_list = get_per_page_data(n, save_mysql=True)  # fetch page n and save it to MySQL
            page_num = one_page_data_list[1]
            info_result = one_page_data_list[2]

            info_result_all.extend(info_result)

            if save_cache is True:
                # Append this page's data to the cache file cache.txt
                save_file_cache = CACHE_FILE_NAME
                with open(save_file_cache, 'a', encoding="utf-8") as f:
                    f.write("## page %s/%s\n" % (page_num, page_count))  # page marker
                    for line in info_result:
                        f.write("%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(line))  # tab-separated row
    
    
    # Write the collected data to an Excel file
    def save_excel():
        save_file = EXCEL_FILE_NAME  # output file name
        sheet_name = WORKSHEET_NAME  # worksheet name
        wb = Workbook()  # create a new workbook
        ws1 = wb.active  # the active worksheet, by default the first one
        ws1.title = sheet_name  # rename the sheet
        for row in info_result_all:
            ws1.append(row)
        wb.save(save_file)  # save the workbook
    
    
    # Main function
    # 1. Iterate over all list pages
    # 2. Save to the cache file cache.txt and to MySQL
    # 3. Keep the data in memory
    # 4. Save it to a spreadsheet
    def main():
        # Request the first list page to learn the total page count
        try:
            page_count = int(get_page_count())  # total number of pages
        except Exception:
            print("Request failed; replace the cookie value (COOKIE_STR) with a fresh one and retry")
            sys.exit()

        # Iterate over all pages and save to MySQL; set save_cache=True to also write cache.txt
        get_cbs_list_data(page_count, save_cache=False)
        # get_cbs_list_data(2, save_cache=True)  # only the first 2 pages

        # Finally: write the data to Excel
        # save_excel()


    # Entry point
    if __name__ == '__main__':
        # Run the main program
        main()
    
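    After a run, the imported rows can be spot-checked directly in MySQL; these are ordinary queries against the table defined above, nothing specific to the script:

    SELECT COUNT(*) FROM article_list;
    SELECT title, url FROM article_list ORDER BY id DESC LIMIT 5;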


    II. Download the HTML pages from the URL list

    1. Create the following directory structure

    └─html
        ├─2
        ├─3
        ├─4
        └─others
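
    The directories can also be created from Python; a minimal sketch using only the path names shown above:

    import os

    # Create html/2, html/3, html/4 and html/others; exist_ok avoids errors on reruns
    for sub in ("2", "3", "4", "others"):
        os.makedirs(os.path.join("html", sub), exist_ok=True)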
    


    2. Create the URL list file and fill it from the SQL database

    1.txt
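
    One way to fill it is to dump every saved URL from MySQL into 1.txt, one per line; a minimal sketch reusing the connection values from the crawler above:

    import pymysql

    # Same connection parameters as the list crawler
    connect = pymysql.Connect(host='localhost', port=3306, user='root',
                              passwd='root', db='cbs', charset='utf8')
    cursor = connect.cursor()
    cursor.execute("SELECT url FROM article_list;")
    with open("1.txt", 'w', encoding="utf-8") as f:
        for (url,) in cursor.fetchall():
            f.write(url + "\n")
    connect.close()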

    3. Download the HTML files

    import os
    import time
    import requests
    
    
    def myurlopen(url):
        # Headers copied from the list crawler above; cookie and path are not needed here
        headers = {
            'authority': 'i-beta.cnblogs.com',
            'method': 'GET',
            # 'path': '/api/posts/list?p=1&cid=&t=1&cfg=0',
            'scheme': 'https',
            'accept': 'application/json, text/plain, */*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # 'cookie': COOKIE_STR,
            'referer': 'https://i-beta.cnblogs.com/posts',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        }

        # Send the GET request and return the raw response
        response = requests.get(url, headers=headers, verify=True)
        return response
    
    
    def save_html(url):
        # Example: url = "https://www.cnblogs.com/andy9468/p/8025420.html"
        server_path, html_name = os.path.split(url)
        print(server_path)
        print(html_name)

        resp = myurlopen(url)
        html_code = resp.status_code

        # Sort the page into a folder by the first digit of its HTTP status code
        html_data = resp.content
        if str(html_code).startswith('2'):
            with open("html/2/%s" % html_name, 'wb') as f:
                f.write(html_data)
        elif str(html_code).startswith('3'):
            with open("html/3/%s" % html_name, 'wb') as f:
                f.write(html_data)
        elif str(html_code).startswith('4'):
            with open("html/4/%s" % html_name, 'wb') as f:
                f.write(html_data)
        else:
            with open("html/others/%s" % html_name, 'wb') as f:
                f.write(html_data)
    
    
    def main():
        # Single-page test:
        # url = "https://www.cnblogs.com/andy9468/p/10005406.html"
        # save_html(url)

        urls_file = "1.txt"
        with open(urls_file, 'r') as f:
            urls = f.readlines()
            for url in urls:
                url = url.strip("\n")
                if not url:
                    continue  # skip blank lines
                time.sleep(1)  # throttle: one request per second
                save_html(url)
    
    
    if __name__ == '__main__':
        main()
    


  • Original post: https://www.cnblogs.com/andy9468/p/14298797.html