昨晚感觉博客园“抖动”了,访问好像有点问题,我突然陷入了对博客平台随时可能无法访问的恐惧之中。好吧,来战,备份总是需要的。不多说了,上代码,直接一条命令备份:
#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Script information:
"""
Author : 陈然 (Chen Ran)
QQ     : 1914518025
Version: cnblogs post backup system v1.01
Changes:
    1. Some special characters in file names are replaced so that the
       backups can be uploaded to Baidu Cloud.

Usage:
    python <this script> [blog_name]

Every article page of the given cnblogs blog is downloaded and stored as
``./backup/<title>.html``.
"""

# Imports
import logging
import os
import re
import sys

import requests

# Global configuration
logging.basicConfig(format="%(message)s", level=logging.INFO)

# Global state.
# NOTE: non-ASCII literals carry an explicit ``u`` prefix so the script runs
# unchanged on Python 2.6+ and on Python 3.3+.
Particle_Urls_List = []  # URLs of every article discovered so far
BlogName = u"你的名字"  # blog name; the URL path segment identifying the blog owner

# Request headers shared by every HTTP call made by this script.
_HEADERS = {
    "User-Agent": "Mozilla/5.0 (iPhone; U; CPU like Mac OS X) "
                  "AppleWebKit/420.1 (KHTML, like Gecko) "
                  "Version/3.0 Mobile/4A93 Safari/419.3"
}
_BACKUP_DIR = "./backup"  # backups are stored here, relative to the CWD
_TITLE_RE = re.compile(r"<title[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL)
# Characters that are not allowed in file names on common file systems.
_UNSAFE_CHARS_RE = re.compile(r'[\\/:*"<>|]')


def _decode_page(response):
    """Return the body of *response* as text.

    Pages are assumed to be UTF-8 (the script itself is UTF-8 and mixes its
    own literals into text taken from the page); undecodable bytes are
    replaced instead of raising.
    """
    return response.content.decode("utf-8", "replace")


def _title_to_filename(html, url):
    """Build a file-system-safe base name from the page's ``<title>``."""
    match = _TITLE_RE.search(html)
    raw_title = match.group(1) if match else u""
    # Drop every whitespace character (spaces, but also newlines inside the
    # <title> element, which would otherwise end up in the file name).
    name = u"".join(raw_title.split())
    # Replace symbols that upset the Baidu Cloud uploader: half-width and
    # full-width question marks, and the hash sign.
    name = name.replace(u"?", u"问号").replace(u"\uff1f", u"问号")
    name = name.replace(u"#", u"井号")
    # Path separators and similar characters would make open() fail.
    name = _UNSAFE_CHARS_RE.sub(u"_", name)
    if not name:
        # No usable title: fall back to the last URL segment (the post id).
        name = url.rstrip("/").rsplit("/", 1)[-1]
        if name.endswith(".html"):
            name = name[:-len(".html")]
    return name


# Global function definitions
def particle_backup(url):
    """Back up a single article.

    The page at *url* is downloaded and its raw bytes are written to
    ``./backup/<title>.html``; the backup directory is created on demand.

    Returns 0 on success and -1 on any failure (request error, non-200
    status code, or the file could not be written).
    """
    try:
        response = requests.get(url, headers=_HEADERS, timeout=3)
    except requests.RequestException as ex:
        logging.error(u"[-]访问%s失败,错误原因:%s", url, ex)
        return -1
    if response.status_code != 200:
        logging.error(u"访问%s失败,未知原因,返回码:%s", url, response.status_code)
        return -1

    # The article title becomes the name of the saved HTML file.
    title = _title_to_filename(_decode_page(response), url)
    logging.info(u"[+]开始备份:%s", title)
    try:
        if not os.path.isdir(_BACKUP_DIR):
            os.makedirs(_BACKUP_DIR)
        # Binary mode: store the page byte-for-byte as it was served.
        with open(os.path.join(_BACKUP_DIR, u"%s.html" % title), "wb") as fd:
            fd.write(response.content)
    except (IOError, OSError) as ex:
        logging.error(u"[-]访问%s失败,错误原因:%s", url, ex)
        return -1
    logging.info(u"[+]备份:%s成功", title)
    return 0


def get_particle_url(url=None):
    """Collect the article URLs found on one listing page.

    *url* defaults to the front page of the blog named by ``BlogName``.
    Newly found article URLs are appended to ``Particle_Urls_List``
    (duplicates are skipped, discovery order is kept).

    Returns the page text on success and -1 on failure.
    """
    if url is None:
        url = u"http://www.cnblogs.com/%s/" % BlogName
    try:
        response = requests.get(url, headers=_HEADERS, timeout=5)
    except requests.RequestException as ex:
        logging.error(u"[-]访问%s失败,原因:%s", url, ex)
        return -1
    if response.status_code != 200:
        logging.error(u"访问%s失败,未知原因,返回码:%s", url, response.status_code)
        return -1

    text = _decode_page(response)
    # Article links look like <scheme>://<host>/<blog>/p/<digits>.html.
    # The blog name is escaped so it cannot alter the pattern.
    pattern = r"https?://[^/]+?/%s/p/\d+\.html" % re.escape(BlogName)
    seen = set(Particle_Urls_List)
    for article_url in re.findall(pattern, text):
        if article_url not in seen:
            seen.add(article_url)
            Particle_Urls_List.append(article_url)
    return text


def start_buckup():
    """Walk the blog's listing pages and collect every article URL.

    Starts at the front page and keeps requesting ``default.html?page=N``
    while the current page contains a pagination link. The walk stops when
    a request fails or when a page contributes no new article URL, so it
    cannot spin forever on pages that always carry a pager.
    """
    # Scheme-agnostic marker: matches both http:// and https:// pager links.
    pager_marker = u"://www.cnblogs.com/%s/default.html?page=" % BlogName
    logging.info(u"[+]开始分析第1页!")
    page = get_particle_url()
    page_number = 2
    # get_particle_url returns -1 (not text) when the request failed.
    while page != -1 and pager_marker in page:
        known_before = len(Particle_Urls_List)
        logging.info(u"[+]开始分析第%s页", page_number)
        page = get_particle_url(
            url=u"http://www.cnblogs.com/%s/default.html?page=%s"
            % (BlogName, page_number)
        )
        if len(Particle_Urls_List) == known_before:
            # Nothing new on this page: we are past the last listing page.
            break
        page_number += 1


# Program entry point
if __name__ == "__main__":
    if len(sys.argv) > 1:
        BlogName = sys.argv[1]  # whose blog to back up
    start_buckup()
    for url in Particle_Urls_List:
        particle_backup(url)
    logging.info(u"备份完成")