本章内容
1、rawlog处理
2、域名item接口刷新
3、备案结果查询
4、多级域名中取主域
5、发送邮件
6、通过api获取cdn edge ip
7、多线程下载
1、rawlog处理
脚本里面涉及的内容
1、使用多cpu处理
2、UTC与GMT+8(Asia/Shanghai)时间的转换
3、一个目录下对子目录的文件遍历
下面有多个panther-*目录
4、gzip文件的读取处理
Parsing Per-Hit (PerHit) Log using Python3 (incl. Multi-Process version)

#!/usr/bin/env python3
# coding: utf-8
"""Convert gzip-compressed per-hit CDN raw logs into hourly access-log files.

``base_path`` is expected to contain one sub-directory per UTC date
(``YYYYMMDD``) holding gzip-compressed log files, possibly in nested
directories.  For every UTC date/hour that has matching log files, one output
file named after the corresponding Asia/Shanghai date/hour is written
directly into ``base_path``.
"""
import os
import sys
import traceback
import re
import gzip
from datetime import datetime

from dateutil import tz

base_path = "/home/xuanjia/static.trthi.com"
file_name_prefix = "F114BC2216604A2C93AF5F6821168CA5_"
file_name_sufix = "_pca_cn_cas_001.log"

# Resolved once at import time instead of twice per log line.
UTC_TZ = tz.gettz('UTC')
LOCAL_TZ = tz.gettz('Asia/Shanghai')

# Column order of one space-separated raw log line.
FIELD_NAMES = (
    "Event-Type", "Site-ID", "Date", "Time", "C-IP", "CS-UserName",
    "S-SiteName", "S-ComputerName", "S-IP", "S-Port", "CS-Method", "CS-URI",
    "CS-URI-Query", "SC-Status", "SC-Win32-Status", "SC-Bytes", "CS-Bytes",
    "Time-Taken", "CS-Version", "CS-Host", "CS-UserAgent", "CS-Cookie",
    "CS-Referer", "SC-Sub-Status", "CS-Range", "SC-Initial", "SC-Complete",
    "SC-ContentType", "Protocol", "SC-Bytes-Body",
    "Bytes-Origin-Uncompressed", "C-RemotePort",
)


def conv_date(input_date, input_hour):
    """Convert a UTC date/hour pair to Asia/Shanghai local time.

    Args:
        input_date: UTC date formatted as ``YYYYMMDD``.
        input_hour: UTC hour formatted as ``HH``.

    Returns:
        A two-item list ``[local_date, local_hour]`` in the same formats.

    Raises:
        ValueError: if the inputs do not match ``%Y%m%d %H``.
    """
    utc_time = datetime.strptime(
        input_date + " " + input_hour, "%Y%m%d %H").replace(tzinfo=UTC_TZ)
    local_time = utc_time.astimezone(LOCAL_TZ)
    return [local_time.strftime("%Y%m%d"), local_time.strftime("%H")]


def convert_line(input_line):
    """Turn one raw per-hit log line into one access-log record.

    Args:
        input_line: a raw log line; fields are separated by single spaces.

    Returns:
        The converted record, without a trailing newline.

    Raises:
        ValueError: if the line has fewer fields than ``FIELD_NAMES`` or its
            Date/Time fields cannot be parsed.
    """
    fields = input_line.split(' ')
    if len(fields) < len(FIELD_NAMES):
        raise ValueError("expected {0} fields, got {1}".format(
            len(FIELD_NAMES), len(fields)))
    record = dict(zip(FIELD_NAMES, fields))

    # Raw timestamps are UTC; the output uses Asia/Shanghai wall-clock time.
    local_time = datetime.strptime(
        record["Date"] + " " + record["Time"],
        "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC_TZ).astimezone(LOCAL_TZ)

    # NOTE(review): "?" is appended even when the query field is empty or a
    # placeholder; kept as-is because downstream consumers may rely on it.
    url = ("http://" + record["CS-Host"] + record["CS-URI"]
           + "?" + record["CS-URI-Query"])

    return " ".join([
        record["C-IP"],
        "-",
        "-",
        "[" + local_time.strftime("%d/%b/%Y:%H:%M:%S +08:00") + "]",
        "\"" + record["CS-Method"],
        url,
        record["CS-Version"] + "\"",
        record["SC-Status"],
        record["SC-Bytes"],
        "\"" + record["CS-Referer"] + "\"",
        "\"" + record["CS-UserAgent"] + "\"",
        "\"-\"",
        record["S-IP"],
    ])


def convert_file(input_path, output_f):
    """Append the converted records of one gzip log file to ``output_f``.

    Malformed lines are skipped and counted so a single bad line does not
    discard the rest of the file.  Any other failure (unreadable or corrupt
    file, write error) is reported and the file is abandoned, so the
    remaining files are still processed.
    """
    skipped = 0
    try:
        with gzip.open(input_path, 'rt', encoding='utf-8') as input_f:
            for input_line in input_f:
                try:
                    converted = convert_line(input_line)
                except ValueError:
                    skipped += 1
                    continue
                output_f.write(converted + "\n")
    except Exception:
        # Best effort: report the failing file and carry on with the others.
        traceback.print_exc(file=sys.stdout)
        print(input_path)
    if skipped:
        print("{0}: skipped {1} malformed line(s)".format(input_path, skipped))


def is_date_dir(name):
    """Return True if ``name`` is a ``YYYYMMDD`` directory under base_path.

    The output files are written into ``base_path`` itself, so a plain
    ``os.listdir`` also returns them on a second run; they must not be
    treated as date directories.
    """
    if re.fullmatch(r"\d{8}", name) is None:
        return False
    try:
        datetime.strptime(name, "%Y%m%d")
    except ValueError:
        return False
    return os.path.isdir(os.path.join(base_path, name))


def main():
    """Convert every date directory under ``base_path``, hour by hour."""
    hours = ['{0:02}'.format(i) for i in range(24)]
    date_dirs = sorted(
        name for name in os.listdir(base_path) if is_date_dir(name))

    for subdir in date_dirs:
        # Walk the tree once per date instead of once per hour.
        candidates = []
        for root, _dirs, files in os.walk(os.path.join(base_path, subdir)):
            for file_name in files:
                if "_upstream_" not in file_name:
                    candidates.append(
                        (file_name, os.path.join(root, file_name)))

        for hour in hours:
            marker = "_" + subdir + "_" + hour + "_"
            input_paths = [path for file_name, path in candidates
                           if marker in file_name]
            if not input_paths:
                # Do not create empty output files for hours without logs.
                continue

            local_date, local_hour = conv_date(subdir, hour)
            output_path = os.path.join(
                base_path,
                file_name_prefix + local_date + "_" + local_hour
                + file_name_sufix)
            print(output_path)
            with open(output_path, 'w', encoding='utf-8') as output_f:
                for input_path in input_paths:
                    convert_file(input_path, output_f)


if __name__ == '__main__':
    main()

#!/usr/bin/env python3
# coding: utf-8
"""Multi-process converter for gzip-compressed per-hit CDN raw logs.

``base_path`` is expected to contain one sub-directory per UTC date
(``YYYYMMDD``) holding gzip-compressed log files, possibly in nested
directories.  The log files of each UTC date/hour form one job; jobs are
distributed over a process pool and each job writes one output file, named
after the corresponding Asia/Shanghai date/hour, into ``base_path``.
"""
import os
import sys
import traceback
import re
import gzip
import time
import multiprocessing
from datetime import datetime
from multiprocessing import Pool

from dateutil import tz

base_path = "/home/xuanjia/static.trthi.com"
file_name_prefix = "F114BC2216604A2C93AF5F6821168CA5_"
file_name_sufix = "_pca_cn_cas_001.log"

# Resolved once at import time instead of twice per log line.
UTC_TZ = tz.gettz('UTC')
LOCAL_TZ = tz.gettz('Asia/Shanghai')

# Column order of one space-separated raw log line.
FIELD_NAMES = (
    "Event-Type", "Site-ID", "Date", "Time", "C-IP", "CS-UserName",
    "S-SiteName", "S-ComputerName", "S-IP", "S-Port", "CS-Method", "CS-URI",
    "CS-URI-Query", "SC-Status", "SC-Win32-Status", "SC-Bytes", "CS-Bytes",
    "Time-Taken", "CS-Version", "CS-Host", "CS-UserAgent", "CS-Cookie",
    "CS-Referer", "SC-Sub-Status", "CS-Range", "SC-Initial", "SC-Complete",
    "SC-ContentType", "Protocol", "SC-Bytes-Body",
    "Bytes-Origin-Uncompressed", "C-RemotePort",
)


def conv_date(input_date, input_hour):
    """Convert a UTC date/hour pair to Asia/Shanghai local time.

    Args:
        input_date: UTC date formatted as ``YYYYMMDD``.
        input_hour: UTC hour formatted as ``HH``.

    Returns:
        A two-item list ``[local_date, local_hour]`` in the same formats,
        e.g. ``['20170912', '09']``.

    Raises:
        ValueError: if the inputs do not match ``%Y%m%d %H``.
    """
    utc_time = datetime.strptime(
        input_date + " " + input_hour, "%Y%m%d %H").replace(tzinfo=UTC_TZ)
    local_time = utc_time.astimezone(LOCAL_TZ)
    return [local_time.strftime("%Y%m%d"), local_time.strftime("%H")]


def convert_line(input_line):
    """Turn one raw per-hit log line into one access-log record.

    Args:
        input_line: a raw log line; fields are separated by single spaces.

    Returns:
        The converted record, without a trailing newline.

    Raises:
        ValueError: if the line has fewer fields than ``FIELD_NAMES`` or its
            Date/Time fields cannot be parsed.
    """
    fields = input_line.split(' ')
    if len(fields) < len(FIELD_NAMES):
        raise ValueError("expected {0} fields, got {1}".format(
            len(FIELD_NAMES), len(fields)))
    record = dict(zip(FIELD_NAMES, fields))

    # Raw timestamps are UTC; the output uses Asia/Shanghai wall-clock time.
    local_time = datetime.strptime(
        record["Date"] + " " + record["Time"],
        "%Y-%m-%d %H:%M:%S").replace(tzinfo=UTC_TZ).astimezone(LOCAL_TZ)

    # NOTE(review): "?" is appended even when the query field is empty or a
    # placeholder; kept as-is because downstream consumers may rely on it.
    url = ("http://" + record["CS-Host"] + record["CS-URI"]
           + "?" + record["CS-URI-Query"])

    return " ".join([
        record["C-IP"],
        "-",
        "-",
        "[" + local_time.strftime("%d/%b/%Y:%H:%M:%S +08:00") + "]",
        "\"" + record["CS-Method"],
        url,
        record["CS-Version"] + "\"",
        record["SC-Status"],
        record["SC-Bytes"],
        "\"" + record["CS-Referer"] + "\"",
        "\"" + record["CS-UserAgent"] + "\"",
        "\"-\"",
        record["S-IP"],
    ])


def proc_log(proc_list):
    """Worker: convert a group of gzip log files into one output file.

    Args:
        proc_list: a pair ``(input_paths, output_path)``; every file in
            ``input_paths`` is converted and appended, in order, to
            ``output_path``, which is created or truncated.

    Malformed lines are skipped and counted so a single bad line does not
    discard the rest of its file.  Any other per-file failure is reported
    and that file is abandoned; the remaining files are still processed.
    """
    proc_files = proc_list[0]
    proc_filename = proc_list[1]
    with open(proc_filename, 'w', encoding='utf-8') as output_f:
        for input_path in proc_files:
            skipped = 0
            try:
                with gzip.open(input_path, 'rt', encoding='utf-8') as input_f:
                    for input_line in input_f:
                        try:
                            converted = convert_line(input_line)
                        except ValueError:
                            skipped += 1
                            continue
                        output_f.write(converted + "\n")
            except Exception:
                # Best effort: report the failing file and keep going.
                traceback.print_exc(file=sys.stdout)
                print(input_path)
            if skipped:
                print("{0}: skipped {1} malformed line(s)".format(
                    input_path, skipped))


def is_date_dir(name):
    """Return True if ``name`` is a ``YYYYMMDD`` directory under base_path.

    The output files are written into ``base_path`` itself, so a plain
    ``os.listdir`` also returns them on a second run; they must not be
    treated as date directories.
    """
    if re.fullmatch(r"\d{8}", name) is None:
        return False
    try:
        datetime.strptime(name, "%Y%m%d")
    except ValueError:
        return False
    return os.path.isdir(os.path.join(base_path, name))


def main():
    """Build one job per UTC date/hour and run the jobs on all CPU cores."""
    time_s = time.time()
    c_count = multiprocessing.cpu_count()

    # Two-digit hour strings: '00', '01', ..., '23'.
    array_hours = ['{0:02}'.format(i) for i in range(24)]
    # Date directories under base_path, e.g. ['20170912', '20170913'].
    array_subdirs = sorted(
        name for name in os.listdir(base_path) if is_date_dir(name))

    array_params = []
    for subdir in array_subdirs:
        # Walk the tree once per date instead of once per hour; upstream
        # logs are excluded from the conversion.
        candidates = []
        for root, _dirs, files in os.walk(os.path.join(base_path, subdir)):
            for file_name in files:
                if "_upstream_" not in file_name:
                    candidates.append(
                        (file_name, os.path.join(root, file_name)))

        for hour in array_hours:
            marker = "_" + subdir + "_" + hour + "_"
            array_proc_files = [path for file_name, path in candidates
                                if marker in file_name]
            if not array_proc_files:
                continue
            # UTC date/hour -> UTC+8 date/hour used in the output file name.
            f_name = conv_date(subdir, hour)
            output_path = os.path.join(
                base_path,
                file_name_prefix + f_name[0] + "_" + f_name[1]
                + file_name_sufix)
            array_params.append((array_proc_files, output_path))

    # Skip pool start-up entirely when there is nothing to convert.
    if array_params:
        with Pool(processes=c_count) as pool:
            pool.map(proc_log, array_params)

    time_delta = time.time() - time_s
    print("Using " + str(time_delta))


if __name__ == '__main__':
    main()
2、域名item接口刷新
脚本里面涉及的内容:
1、针对域名对uri做收集
2、收集1000uri后做处理
3、url的截取domain、uri
4、使用 requests 发送 POST 请求时,处理多个 key 相同的参数

#!/usr/bin/env python3 #python version 3 import sys import requests from urllib.parse import urlparse username = 'Mr.python' #input your username password = '*******' #input your password mailto = 'Mr.python@txnetworks.cn' #input your email-address if len(sys.argv) != 2: print('