暂时记录,改天再整理
import json
import re
import os
import shutil
import socket

import pandas as pd
from requests import get
from docx import Document
import win32com.client as win
from bs4 import BeautifulSoup


def doc_convert_to_docx(src_doc: str, to_docx: str):
    """Convert every .doc file in ``src_doc`` into a .docx file in ``to_docx``.

    Uses Word COM automation, so this only works on Windows with
    Microsoft Word installed.

    :param src_doc: directory holding the source ``.doc`` files
    :param to_docx: directory the converted ``.docx`` files are written to
    :return: None
    """
    if not os.path.isdir(src_doc):
        os.mkdir(src_doc)
    if not os.path.isdir(to_docx):
        os.mkdir(to_docx)
    word = win.Dispatch('Word.Application')
    try:
        for name in os.listdir(src_doc):
            doc = word.Documents.Open(src_doc + name)
            # FileFormat=12 is wdFormatXMLDocument (.docx)
            doc.SaveAs(to_docx + name.replace('.doc', '.docx'), FileFormat=12)
            doc.Close()  # BUG FIX: documents were never closed, leaking Word windows
    finally:
        word.Quit()  # BUG FIX: Word stayed running if a conversion raised


def gen_statistics_table(src_dir: str, file: str):
    """Collect the summary table of every report into one Excel statistics sheet.

    The first row of ``doc.tables[1]`` supplies the column headers and the
    second row that report's values.

    :param src_dir: directory of the ``.docx`` reports
    :param file: path of the Excel statistics table to write
    :return: None
    """
    data = {}
    for name in os.listdir(src_dir):
        doc = Document(docx=src_dir + name)
        table = doc.tables[1]
        for header, value in zip(table.rows[0].cells, table.rows[1].cells):
            # setdefault tolerates a header that first appears after file 0
            # (the original raised KeyError in that case)
            data.setdefault(header.text, []).append(value.text)
    # NOTE(review): to_excel(encoding=...) was removed in pandas 2.0 —
    # drop the argument when running on a modern pandas.
    pd.DataFrame(data).to_excel(file, index=False, encoding='gbk')


def fmt_file_name(src_dir: str, path_dir: str, file: str):
    """Rename each report pair to ``<site name>_<domain>.doc(x)``.

    The site name comes from the statistics sheet; the domain is extracted
    from the current file name (``*_<domain>_*.docx``).

    :param src_dir: directory of the original ``.doc`` files
    :param path_dir: directory of the converted ``.docx`` files
    :param file: path of the statistics table
    :return: None
    """
    data = pd.read_excel(file, encoding='gbk')
    for name in os.listdir(path_dir):
        doc = Document(path_dir + name)
        site_in_doc = doc.tables[1].rows[1].cells[1].text
        for site_name in data['网站名称']:
            if site_name != site_in_doc:
                continue
            # BUG FIX: escape the dot before the extension
            match = re.match(r'.*_(.*)_.*\.docx', name)
            domain = match.group(1)
            os.rename(path_dir + name,
                      path_dir + site_name + '_' + domain + '.docx')
            doc_name = name.replace('.docx', '.doc')
            os.rename(src_dir + doc_name,
                      src_dir + site_name + '_' + domain + '.doc')
            # BUG FIX: stop after the rename — the old path no longer
            # exists, so a second match would raise FileNotFoundError
            break


def _record_vuln(level: dict, path_dir: str, names: list, url: str,
                 domain: str, pressing, high, medium):
    """Append one vulnerability row to ``level`` for every report matching ``url``.

    Deliberately scans all reports without breaking, matching the original
    behaviour of recording each report whose summary table carries the URL.
    """
    for f in names:
        doc = Document(path_dir + f)
        summary = doc.tables[1].rows[1]
        if summary.cells[0].text != url:
            continue
        level['网站网址'].append(url)
        level['网站名称'].append(summary.cells[1].text)
        level['网站域名'].append(domain)
        level['漏洞详情'].append(doc.tables[2].rows[1].cells[0].text)
        # BUG FIX: the original medium-severity branch tested `medium > 0`
        # where it meant `high > 0`; clamping each count to >= 0 covers
        # every branch uniformly.
        level['紧急数量'].append(pressing if pressing > 0 else 0)
        level['高危数量'].append(high if high > 0 else 0)
        level['中危数量'].append(medium if medium > 0 else 0)


def gen_bugs_and_access_table(src_dir: str, path_dir: str, file: str):
    """Build the accessibility sheet and the medium/high-severity detail sheet.

    :param src_dir: directory of the statistics table (also the output dir)
    :param path_dir: directory of the ``.docx`` reports
    :param file: statistics table file name (relative to ``src_dir``)
    :return: None
    """
    data = pd.read_excel(src_dir + file, encoding='gbk')
    names = os.listdir(path_dir)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    access = {'网站网址': [], '网站名称': [], '网站域名': [], '网站IP': [],
              '网站是否可正常访问': [], '响应状态码': []}
    level = {'网站网址': [], '网站名称': [], '网站域名': [], '紧急数量': [],
             '高危数量': [], '中危数量': [], '漏洞详情': []}
    for url, pressing, high, medium in zip(data['网站URL'], data['紧急'],
                                           data['高危'], data['中危']):
        # BUG FIX: the original pattern `https?://(.*..*)/?` used unescaped
        # dots; grab everything up to the first slash instead.
        domain = re.match(r'https?://([^/]+)', url).group(1)
        try:
            ip = socket.gethostbyname(domain)
        except OSError:  # BUG FIX: was a bare except; gaierror is an OSError
            ip = ''
        access['网站网址'].append(url)
        access['网站域名'].append(domain)
        access['网站IP'].append(ip)
        if pressing > 0 or high > 0 or medium > 0:
            _record_vuln(level, path_dir, names, url, domain,
                         pressing, high, medium)
        try:
            headers['Host'] = domain
            r = get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(r.text, 'html.parser')
            # BUG FIX: guard against pages without a <title>, and keep the
            # title as str — the original stored gbk-encoded bytes objects
            # in the Excel cell.
            title = soup.title.string if soup.title else ''
            if 200 <= r.status_code <= 301:
                access['网站名称'].append(title)
                access['网站是否可正常访问'].append('是')
            else:
                access['网站名称'].append('无法访问')
                access['网站是否可正常访问'].append('否')
            access['响应状态码'].append(r.status_code)
        except Exception:  # network/parse failure => unreachable
            access['网站名称'].append('无法访问')
            access['网站是否可正常访问'].append('否')
            access['响应状态码'].append('null')
    pd.DataFrame(access).to_excel(src_dir + '网站可访问信息.xlsx',
                                  index=False, encoding='gbk')
    pd.DataFrame(level).to_excel(src_dir + '中高危网站详情.xlsx',
                                 index=False, encoding='gbk')


def _copy_tables(tables, target):
    """Copy each source table's cell text into a new table of ``target``.

    BUG FIX: the original called ``target.add_table(rows, cols)`` with
    *lists* of cell objects, but python-docx expects integer row/column
    counts — that call raised TypeError. Here we create a table of the
    same shape and copy the text cell by cell.
    """
    for table in tables:
        new = target.add_table(rows=len(table.rows), cols=len(table.columns))
        for r, row in enumerate(table.rows):
            for c, cell in enumerate(row.cells):
                new.cell(r, c).text = cell.text


def gen_bugs_report(src: str):
    """Split each report's vulnerability tables into four severity documents.

    The counts per severity are parsed from the "3.1"–"3.4" section
    headings in the document text; the vulnerability tables start at
    ``doc.tables[3]``.

    :param src: directory of the ``.docx`` reports (also the output dir)
    :return: None
    """
    press_docx = Document()
    high_docx = Document()
    medium_docx = Document()
    low_docx = Document()
    for f in os.listdir(src):
        doc = Document(src + f)
        text = ''.join(p.text for p in doc.paragraphs)

        def count(section):
            # BUG FIX: the original patterns used `(d+)` (a literal 'd'),
            # so no count was ever extracted; `(\d+)` was intended.
            m = re.match(r'.*3.' + section + r'.(\d+).*', text)
            return int(m.group(1)) if m else 0

        pressing = count('1')
        high = count('2')
        medium = count('3')
        low = count('4')

        if pressing:
            _copy_tables(doc.tables[3:3 + pressing], press_docx)
        if high:
            # NOTE(review): same start offset as the pressing tables —
            # probably wrong when a report has both; kept as in the original.
            _copy_tables(doc.tables[3:3 + high], high_docx)
        if medium:
            start = 3 + pressing if pressing else (3 + high if high else 3)
            # BUG FIX: the original read `table.cells[0]` (AttributeError)
            # in this branch; _copy_tables handles all branches uniformly.
            _copy_tables(doc.tables[start:start + medium], medium_docx)
        if low:
            if pressing and medium:
                start = 3 + pressing + medium
            elif high and medium:
                start = 3 + high + medium  # BUG FIX: stray `+ +` removed
            elif pressing:
                start = 3 + pressing
            elif high:
                start = 3 + high
            else:
                start = 3
            # BUG FIX: low tables were appended to medium_docx
            _copy_tables(doc.tables[start:start + low], low_docx)
    press_docx.save(src + 'pressing.docx')
    high_docx.save(src + 'high.docx')
    medium_docx.save(src + 'medium.docx')
    low_docx.save(src + 'low.docx')


def split_by_area(src: str, file: str):
    """Move each report into a per-area sub-directory of ``src``.

    The area is matched against the tail of the first table cell's text;
    files matching no configured area go to the default directory named by
    ``area_conf['0']`` (assumed to exist already, as in the original).

    :param src: directory of the ``.docx`` reports
    :param file: path of the JSON area-configuration file
    :return: None
    """
    with open(file, encoding='utf8') as fp:  # BUG FIX: handle was never closed
        area_conf = json.load(fp)
    for f in os.listdir(src):
        doc = Document(src + f)
        text = doc.tables[0].rows[0].cells[0].text
        for area in area_conf.values():
            if re.search(r'.*(' + area + r')$', text):
                if not os.path.exists(src + area):
                    os.mkdir(src + area)
                shutil.move(src + f, src + area + '/' + f)
                break
        else:
            # no area matched: file goes to the default ('0') directory
            shutil.move(src + f, src + area_conf['0'] + '/' + f)


if __name__ == '__main__':
    src = input('请输入文件目录:')
    doc_ = 'doc_/'
    docx_ = 'docx_/'
    convert_to = src + docx_
    statistics_table_name = '漏洞统计.xlsx'
    doc_convert_to_docx(src_doc=src + doc_, to_docx=convert_to)
    gen_statistics_table(src_dir=convert_to, file=src + statistics_table_name)
    gen_bugs_and_access_table(src_dir=src, path_dir=convert_to,
                              file=statistics_table_name)
    fmt_file_name(src_dir=src + doc_, path_dir=convert_to,
                  file=src + statistics_table_name)
还有一个脚本,要求是根据单位名称,按给定的单位类型和行业类型做“合适的”选择(说到这个就想吐槽几句):
import pandas as pd import json import re f = open('../auto_conf.json', 'r', encoding='gbk') import_conf = json.load(f) sheel = '../1.xlsx' data = pd.read_excel(sheel, index=False, encoding='gbk') companies = [] industry = [] for s in data['单位名称']: com = import_conf['单位类型'] ind = import_conf['行业类型'] for k in com.keys(): m = re.match(r'.*(' + k + r').*', s) if m: companies.append(com[k]) break else: companies.append('其它') for k in ind.keys(): m = re.search(r'.*(' + k + r').*', s) if m: industry.append(ind[k]) break else: industry.append('其它') data.update({'单位类型': companies, '行业类型': industry}) df = pd.DataFrame(data) df.to_excel('C:/Users/User/Documents/1.xlsx', index=False, encoding='gbk')
这个 auto_conf.json 是我自己弄的,类似做机器学习的时候自己弄数据集,但这里数据量太小,才500个网站,不好用机器学习,而且自建数据集麻烦,所以选择了用 regex 处理。
auto_conf.json:
{ "单位类型": { "环境保护": "政府机关", "财政": "政府机关", "局": "事业单位", "小学": "事业单位", "中学": "事业单位", "大学": "事业单位", "学院": "事业单位", "学校": "事业单位", "学": "其它", "公司": "企业", "中心": "事业单位", "会": "党委机关", "所": "事业单位", "站": "事业单位", "队": "事业单位", "院": "事业单位", "社": "社团组织", "台": "事业单位", "宫": "事业单位", "馆": "事业单位", "党政": "党委机关", "监狱": "政府机关", "政府": "政府机关" }, "行业类型": { "卫生": "卫生", "学校": "教育", "小学": "教育", "中学": "教育", "大学": "教育", "学院": "教育", "贴吧": "经营性公众互联网", "学点": "经营性公众互联网", "网": "经营性公众互联网", "医院": "卫生", "政府": "政府部门", "委员": "政府部门", "通信": "电信", "广播": "广电", "监狱": "司法", "铁路": "铁路", "银行": "银行", "海关": "海关", "税务": "税务", "民航": "民航", "电力": "电力", "证券": "证券", "保险": "保险", "水务": "水利", "公安": "公安", "财政": "财政", "审计": "审计", "贸易": "商业贸易", "国土": "国土资源", "冶": "能源", "统计": "统计", "行政管理": "行政管理", "邮政": "邮政", "教育": "教育", "农业": "农业", "水利": "水利", "文化": "文化", "科技": "科技", "广告": "宣传", "监督检测": "质量监督检验检疫", "人力": "人事劳动和社会保障", "气象": "统计" } }
一些参考链接:
1.https://code.activestate.com/recipes/279003-converting-word-documents-to-text/
2.https://stackoverflow.com/questions/1468099/python-win32-extensions-documentation
3.https://stackoverflow.com/questions/38468442/multiple-doc-to-docx-file-conversion-using-python