  • A Python program to back up a cnblogs (博客园) blog

    I. Get the list of cnblogs URLs

    1. The SQL table

    Create a database named cbs.
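    If the database does not exist yet, CREATE DATABASE cbs DEFAULT CHARACTER SET utf8; creates it (assuming MySQL, which the DDL below targets).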

    Create the table by running article_list.sql:

    SET FOREIGN_KEY_CHECKS=0;
    
    -- ----------------------------
    -- Table structure for article_list
    -- ----------------------------
    DROP TABLE IF EXISTS `article_list`;
    CREATE TABLE `article_list` (
      `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
      `title` varchar(255) DEFAULT NULL,
      `countView` varchar(10) DEFAULT NULL,
      `countComment` varchar(10) DEFAULT NULL,
      `url` varchar(100) DEFAULT NULL,
      `datePublished` datetime DEFAULT NULL,
      `dateUpdated` datetime DEFAULT NULL,
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
    


    2. The cookie

    Log in to cnblogs in a browser and save the Cookie request-header value of that session into mycookie.txt.
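    Before running the full script, a minimal check like the sketch below (not part of the original post; it reuses the list API endpoint configured in the program that follows) confirms that the saved cookie is still valid:

    import requests
    
    # Read the cookie header value saved from the browser session
    with open("./mycookie.txt", 'r', encoding="utf-8") as f:
        cookie_str = f.read().strip()
    
    # Request page 1 of the posts-list API used by the main script below
    resp = requests.get(
        "https://i.cnblogs.com/api/posts/list?p=1&cid=&t=1&cfg=0",
        headers={"cookie": cookie_str, "accept": "application/json, text/plain, */*"},
    )
    print(resp.status_code)  # 200 means the cookie still works
    print(resp.json().get("postsCount"))  # total post count when the cookie is valid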

    3. Fetch the URL list

    Python dependencies

    requirements.txt

    certifi==2020.4.5.1
    chardet==3.0.4
    et-xmlfile==1.0.1
    idna==2.9
    jdcal==1.4.1
    openpyxl==3.0.3
    PyMySQL==0.9.3
    requests==2.23.0
    urllib3==1.25.9
    
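    Install them with pip install -r requirements.txt, ideally inside a virtual environment.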


    The Python program

    import time
    import json
    import math
    import requests
    import sys
    import os
    
    from openpyxl import Workbook
    import pymysql
    
    # Startup banner
    version_str = "V1.2"
    logo_str = "cnBlogs_List"
    logo_pic_str = r"""
                ____  _                           _     _     _  
      ___ _ __ | __ )| | ___   __ _ ___          | |   (_)___| |_
     / __| '_ \|  _ \| |/ _ \ / _` / __|         | |   | / __| __|
    | (__| | | | |_) | | (_) | (_| \__ \         | |___| \__ \ |_
     \___|_| |_|____/|_|\___/ \__, |___/  _____  |_____|_|___/\__|
                              |___/      |_____|                 
    """
    print("%s %s" % (logo_str, version_str), end='')
    print(logo_pic_str)
    print("%s %s starting..." % (logo_str, version_str))
    time.sleep(2.5)
    
    # Startup
    # Configuration
    # Option 1: assign the cookie value directly to the constant COOKIE_STR
    COOKIE_STR = ""
    # Option 2: store the cookie value in a file
    COOKIE_PATH = "./mycookie.txt"
    if os.path.exists(COOKIE_PATH) and len(COOKIE_STR) == 0:
        # If the cookie file exists and COOKIE_STR has not been set, read the file
        with open(COOKIE_PATH, 'r', encoding="utf-8") as f:
            data_str = f.read()
            if len(data_str) > 0:
                # Use the file content as the cookie value
                COOKIE_STR = data_str
            else:
                print("There is no cookie value in the file %s" % COOKIE_PATH)
                sys.exit()
    
    CACHE_FILE_NAME = "cache.txt"  # cache file name
    EXCEL_FILE_NAME = "result.xls"  # spreadsheet file name
    WORKSHEET_NAME = "cnblogs_admin"  # worksheet name
    # TABLE_HEAD_TITLE_LIST = ["title", "countView", "countComment", "url", "datePublished", "dateUpdated"]  # English header row, if preferred
    TABLE_HEAD_TITLE_LIST = ["标题", "阅读量", "评论数", "链接", "首次发布时间", "最近更新时间"]  # header row for *.xls and cache.txt
    SINGLE_PAGE_COUNT = 10  # articles per list page
    
    # Request parameters
    # REQUEST_URL_PART = r'https://i-beta.cnblogs.com/api/posts/list?p=%s&cid=&t=1&cfg=0'  # old beta endpoint
    REQUEST_URL_PART = r'https://i.cnblogs.com/api/posts/list?p=%s&cid=&t=1&cfg=0'
    REQUEST_HEADERS = {
        'authority': 'i-beta.cnblogs.com',
        'method': 'GET',
        'path': '/api/posts/list?p=1&cid=&t=1&cfg=0',
        'scheme': 'https',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'cookie': COOKIE_STR,
        'referer': 'https://i-beta.cnblogs.com/posts',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    }
    
    # Global state
    info_result_all = []  # accumulated rows from all pages
    
    # MySQL connection settings
    MYSQL_HOST = 'localhost'
    MYSQL_PORT = 3306
    MYSQL_USER = 'root'
    MYSQL_PASSWD = 'root'
    MYSQL_DB = 'cbs'
    MYSQL_CHARSET = 'utf8'
    
    # Connect to the database
    connect = pymysql.Connect(
        host=MYSQL_HOST,
        port=MYSQL_PORT,
        user=MYSQL_USER,
        passwd=MYSQL_PASSWD,
        db=MYSQL_DB,
        charset=MYSQL_CHARSET
    )
    # Get a cursor
    cursor = connect.cursor()
    
    
    # Functions
    # Get the total number of list pages
    def get_page_count(page_n=1):
        page_num = page_n
        # Build the request
        url = REQUEST_URL_PART % page_num
        headers = REQUEST_HEADERS
        # Send the GET request
        response = requests.get(url, headers=headers, verify=True)
        html_data = response.content.decode()
        data_dict = json.loads(html_data)
        all_post_count = data_dict["postsCount"]  # total number of posts
        page_count = math.ceil(all_post_count / SINGLE_PAGE_COUNT)  # total number of pages
        # Return the total page count
        return page_count
    
    
    # Fetch the data of one list page
    # 1. Print it to the console
    # 2. Insert it into the article_list table of the cbs MySQL database
    def get_per_page_data(page_n=1, save_mysql=True):
        page_num = page_n
    
        url = REQUEST_URL_PART % page_num
        headers = REQUEST_HEADERS
        # Send the GET request
        response = requests.get(url, headers=headers, verify=True)
        html_data = response.content.decode()
        data_dict = json.loads(html_data)
        post_list = data_dict["postList"]  # the posts on this page
        all_post_count = data_dict["postsCount"]  # total number of posts
        page_count = math.ceil(all_post_count / SINGLE_PAGE_COUNT)  # total number of pages
    
        print("[status %s] page %s" % (response.status_code, page_n))
    
        info_result = []  # rows collected from this page
        # Write the header row on the first page only
        if page_num == 1:
            info = TABLE_HEAD_TITLE_LIST  # header row
            info_result.append(info)
    
        # Collect the rows of this page
        for index, item in enumerate(post_list):
            title = item['title']  # title
            viewCount = item['viewCount']  # view count
            comment_count = item['feedBackCount']  # comment count
            url = "https:%s" % item['url']  # article url
            datePublished = item['datePublished']  # first published
            dateUpdated = item['dateUpdated']  # last updated
            # Print to the console
            print((index + ((page_n - 1) * SINGLE_PAGE_COUNT) + 1), end='  ')
            print(title)
    
            info = [title, viewCount, comment_count, url, datePublished, dateUpdated]  # one row
            # Keep the row in memory
            info_result.append(info)
    
            if save_mysql is True:
                # Insert the row into MySQL.
                # Parameterized query: quotes in a title can no longer break the statement
                sql_1 = "INSERT INTO article_list(title,countView,countComment,url,datePublished,dateUpdated) VALUES (%s,%s,%s,%s,%s,%s);"
                cursor.execute(sql_1, tuple(info))
                connect.commit()  # commit the insert
    
        return page_count, page_num, info_result
    
    
    # Iterate over the list pages; store the data in MySQL and, optionally, a cache file
    def get_cbs_list_data(page_count, save_cache=True):
        for n in range(1, page_count + 1):
            time.sleep(1)  # pause one second to keep the request rate low
            one_page_data_list = get_per_page_data(n, save_mysql=True)  # fetch page n and store it in MySQL
            page_num = one_page_data_list[1]
            info_result = one_page_data_list[2]
    
            info_result_all.extend(info_result)
    
            if save_cache is True:
                # Append this page's data to the cache file cache.txt
                save_file_cache = CACHE_FILE_NAME
                with open(save_file_cache, 'a', encoding="utf-8") as f:
                    f.write("## page %s/%s\n" % (page_num, page_count))  # record the page number
                    for line in info_result:
                        f.write("%s\t%s\t%s\t%s\t%s\t%s\n" % tuple(line))  # one tab-separated row per article
    
    
    # Write the collected data to an Excel file
    def save_excel():
        save_file = EXCEL_FILE_NAME  # output file name
        sheet_name = WORKSHEET_NAME  # worksheet name
        wb = Workbook()  # create a new workbook
        ws1 = wb.active  # the active worksheet, the first one by default
        ws1.title = sheet_name  # rename the sheet
        for row in info_result_all:
            ws1.append(row)
        wb.save(save_file)  # save the workbook
    
    
    # Main
    # 1. Walk through all list pages
    # 2. Store the data in the cache file cache.txt and in the MySQL database
    # 3. Keep the data in memory
    # 4. Optionally write it to an xls spreadsheet
    def main():
        # Request the first list page to get the total page count
        try:
            page_count = int(get_page_count())
        except Exception:
            print("Request failed; replace the cookie value (COOKIE_STR) with a fresh one and retry")
            sys.exit()
    
        # Walk through all pages, storing to MySQL (and to cache.txt when save_cache=True)
        get_cbs_list_data(page_count, save_cache=False)
        # get_cbs_list_data(2, save_cache=True)  # only the first 2 pages
    
        # Finally: write the data to Excel
        # save_excel()
    
    
    # Entry point
    if __name__ == '__main__':
        # Start the main program
        main()
    
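    Run the script directly (the post does not name the file; any name such as cnblogs_list.py works). By default it only writes to MySQL; pass save_cache=True to get_cbs_list_data() to also fill cache.txt, and uncomment the save_excel() call in main() to produce result.xls.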


    II. Download the HTML pages from the URL list

    1. Create the following directory structure (a sketch that creates it follows the tree)

    └─html
        ├─2
        ├─3
        ├─4
        └─others
    
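    Instead of creating the directories by hand, a short sketch like this one (an assumption, not part of the original post; run it from the downloader's working directory) sets up the same tree:

    import os
    
    # Create html/2, html/3, html/4 and html/others for the downloader below
    for sub in ("2", "3", "4", "others"):
        os.makedirs(os.path.join("html", sub), exist_ok=True)  # no error if they already exist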


    2. Create the URL list file and fill it with the URLs exported from the article_list table (one way to export them is sketched below):

    1.txt
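    The post does not show how 1.txt is produced; a minimal export sketch like the following (assuming the MySQL settings of the first script are unchanged) writes one URL per line:

    import pymysql
    
    # Connect with the same settings the list scraper used
    connect = pymysql.Connect(host='localhost', port=3306, user='root',
                              passwd='root', db='cbs', charset='utf8')
    cursor = connect.cursor()
    cursor.execute("SELECT url FROM article_list;")
    # Write one URL per line into 1.txt
    with open("1.txt", 'w', encoding="utf-8") as f:
        for (url,) in cursor.fetchall():
            f.write("%s\n" % url)
    connect.close()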

    3. Download the HTML files

    import os
    import time
    import requests
    import shutil
    from contextlib import closing
    from urllib.request import urlopen
    
    
    # For Python 2, use instead:
    # from urllib2 import urlopen
    
    
    def myurlopen(url):
        # url = 'https://www.baidu.com'
        headers = {
            'authority': 'i-beta.cnblogs.com',
            'method': 'GET',
            # 'path': '/api/posts/list?p=1&cid=&t=1&cfg=0',
            'scheme': 'https',
            'accept': 'application/json, text/plain, */*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # 'cookie': COOKIE_STR,
            'referer': 'https://i-beta.cnblogs.com/posts',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        }
    
        # Send the GET request
        response = requests.get(url, headers=headers, verify=True)
        # To get the html text:
        # html_data = response.content.decode()
        return response
    
    
    def save_html(url):
        # url = "https://www.cnblogs.com/andy9468/p/10005406.html"
        # url = "https://www.cnblogs.com/andy9468/p/8025420.html"
        server_path, html_name = os.path.split(url)
        print(server_path)
        print(html_name)
    
        # with closing(urlopen('ftp://www.xxxx.com/haha.txt')) as page:
        resp = myurlopen(url)
        html_code = resp.status_code
    
        # print(html_code)
        html_data = resp.content
        if str(html_code).startswith('2'):
            with open("html/2/%s" % html_name, 'wb') as f:
                f.write(html_data)
        elif str(html_code).startswith('3'):
            with open("html/3/%s" % html_name, 'wb') as f:
                f.write(html_data)
        elif str(html_code).startswith('4'):
            with open("html/4/%s" % html_name, 'wb') as f:
                f.write(html_data)
        else:
            with open("html/others/%s" % html_name, 'wb') as f:
                f.write(html_data)
    
    
    def main():
        # url = "https://www.cnblogs.com/andy9468/p/10005406.html"
        # save_html(url)
    
        urls_file = "1.txt"
        with open(urls_file, 'r') as f:
            urls = f.readlines()
            for url in urls:
                url = url.strip("\n")  # drop the trailing newline
                time.sleep(1)
                save_html(url)
    
    
    if __name__ == '__main__':
        main()
    
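    With 1.txt next to the script, running it downloads each page (one request per second) and files the result under html/2, html/3, html/4 or html/others according to the first digit of the HTTP status code.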

