暂时记录,改天再整理
import json
import re
import os
import pandas as pd
from requests import get
from docx import Document
import win32com.client as win
# import subprocess
import socket
import shutil
# import sys
from bs4 import BeautifulSoup
# import openpyxl
def doc_convert_to_docx(src_doc: str, to_docx: str):
    """Convert every .doc file in *src_doc* into a .docx file in *to_docx*.

    Uses Word COM automation (win32com), so Microsoft Word must be
    installed and this only runs on Windows.

    :param src_doc: directory containing the source .doc files
    :param to_docx: directory that receives the converted .docx files
    :return: None
    """
    os.makedirs(src_doc, exist_ok=True)
    os.makedirs(to_docx, exist_ok=True)
    word = win.Dispatch('Word.Application')
    try:
        for name in os.listdir(src_doc):
            doc = word.Documents.Open(os.path.join(src_doc, name))
            try:
                # Swap only the trailing extension; the old
                # str.replace('.doc', '.docx') also mangled names that merely
                # contained ".doc" in the middle.
                root, _ = os.path.splitext(name)
                # FileFormat=12 is wdFormatXMLDocument (.docx).
                doc.SaveAs(os.path.join(to_docx, root + '.docx'), FileFormat=12)
            finally:
                doc.Close()
    finally:
        # Always quit Word, even on failure, so no orphan WINWORD.EXE
        # process is left running (the old code leaked it on any error).
        word.Quit()
def gen_statistics_table(src_dir: str, file: str):
    """Build a statistics spreadsheet from the second table of each .docx report.

    Row 0 of ``doc.tables[1]`` in every document supplies the column
    names; row 1 supplies one record per document.

    :param src_dir: directory containing the .docx documents
    :param file: output path of the Excel statistics table
    :return: None
    """
    data = {}
    for i, name in enumerate(os.listdir(src_dir)):
        doc = Document(docx=os.path.join(src_dir, name))
        table = doc.tables[1]
        header_cells = table.rows[0].cells
        value_cells = table.rows[1].cells
        for header, value in zip(header_cells, value_cells):
            if i == 0:
                # First document defines the columns.
                data[header.text] = [value.text]
            else:
                data[header.text].append(value.text)
    # The ``encoding`` keyword of DataFrame.to_excel was removed in
    # pandas 1.2; .xlsx is zipped XML and needs no text encoding.
    pd.DataFrame(data).to_excel(file, index=False)
def fmt_file_name(src_dir: str, path_dir: str, file: str):
    """Rename report files to ``<site name>_<domain>`` using the statistics table.

    :param src_dir: directory holding the original .doc documents
    :param path_dir: directory holding the converted .docx documents
    :param file: path of the statistics spreadsheet (must contain a
        ``网站名称`` column)
    :return: None
    """
    # pandas removed the ``encoding`` keyword from read_excel; the engine
    # decodes the .xlsx bytes itself.
    data = pd.read_excel(file)
    site_names = set(data['网站名称'])
    for name in os.listdir(path_dir):
        doc = Document(os.path.join(path_dir, name))
        site = doc.tables[1].rows[1].cells[1].text
        if site not in site_names:
            continue
        # File names look like ``<prefix>_<domain>_<suffix>.docx``; the
        # dot before "docx" is escaped (the old pattern left it as a
        # wildcard), and a non-matching name is skipped instead of
        # crashing on ``None.group``.
        match = re.match(r'.*_(.*)_.*\.docx', name)
        if match is None:
            continue
        domain = match.group(1)
        os.rename(os.path.join(path_dir, name),
                  os.path.join(path_dir, site + '_' + domain + '.docx'))
        os.rename(os.path.join(src_dir, name.replace('.docx', '.doc')),
                  os.path.join(src_dir, site + '_' + domain + '.doc'))
        # Old code kept looping over all site names after renaming, which
        # would try to rename a file that no longer exists; one rename per
        # document is what the logic intends.
def _find_report_detail(path_dir: str, names, url: str):
    """Locate the .docx report whose summary table (tables[1]) matches *url*.

    Returns ``(site_name, detail_text)`` from the matching document, or
    ``None`` when no report belongs to the URL.
    """
    for fname in names:
        doc = Document(os.path.join(path_dir, fname))
        summary_row = doc.tables[1].rows[1]
        if summary_row.cells[0].text != url:
            continue
        detail = doc.tables[2].rows[1].cells[0].text
        return summary_row.cells[1].text, detail
    return None


def gen_bugs_and_access_table(src_dir: str, path_dir: str, file: str):
    """Generate the vulnerability-detail and reachability spreadsheets.

    For every site URL in the statistics table this:
    * records whether the site answers an HTTP request (access sheet);
    * when the site has urgent/high/medium findings, copies the detail
      text out of its .docx report (level sheet).

    Output files ``网站可访问信息.xlsx`` and ``中高危网站详情.xlsx`` are
    written into *src_dir*.

    :param src_dir: directory containing the statistics table (output goes here)
    :param path_dir: directory containing the .docx reports
    :param file: file name of the statistics table
    :return: None
    """
    data = pd.read_excel(os.path.join(src_dir, file))
    names = os.listdir(path_dir)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    access = {'网站网址': [], '网站名称': [], '网站域名': [], '网站IP': [], '网站是否可正常访问': [], '响应状态码': []}
    level = {'网站网址': [], '网站名称': [], '网站域名': [], '紧急数量': [], '高危数量': [], '中危数量': [], '漏洞详情': []}
    for url, pressing, high, medium in zip(data['网站URL'], data['紧急'], data['高危'], data['中危']):
        # Capture only the host: the old pattern ``(.*..*)`` (dots
        # unescaped) swallowed the whole URL path as well.
        mat = re.match(r'https?://([^/]+)', url)
        domain = mat.group(1) if mat else url
        ip = ''
        try:
            ip = socket.gethostbyname(domain)
        except OSError:
            pass  # unresolvable host -> leave the IP column empty
        access['网站网址'].append(url)
        access['网站域名'].append(domain)
        access['网站IP'].append(ip)
        # The three original branches (urgent / high / medium) all did the
        # same thing; they collapse into one once the copy-paste bug is
        # fixed (the medium branch tested ``medium > 0`` where
        # ``high > 0`` was intended for the 高危数量 column).
        if pressing > 0 or high > 0 or medium > 0:
            found = _find_report_detail(path_dir, names, url)
            if found is not None:
                site_name, detail = found
                level['网站网址'].append(url)
                level['网站名称'].append(site_name)
                level['网站域名'].append(domain)
                level['漏洞详情'].append(detail)
                level['紧急数量'].append(pressing if pressing > 0 else 0)
                level['高危数量'].append(high if high > 0 else 0)
                level['中危数量'].append(medium if medium > 0 else 0)
        try:
            headers['Host'] = domain
            r = get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(r.text, 'html.parser')
            # Keep the title as str: the old ``.encode('gbk')`` put raw
            # bytes into the DataFrame, so Excel showed b'...' literals.
            title = soup.title.string
            if 200 <= r.status_code <= 301:
                access['网站名称'].append(title)
                access['网站是否可正常访问'].append('是')
            else:
                access['网站名称'].append('无法访问')
                access['网站是否可正常访问'].append('否')
            access['响应状态码'].append(r.status_code)
        except Exception:
            # Network errors, timeouts and pages without a <title> all
            # count as "not reachable" (as in the original bare except).
            access['网站名称'].append('无法访问')
            access['网站是否可正常访问'].append('否')
            access['响应状态码'].append('null')
    # ``encoding`` was removed from DataFrame.to_excel in pandas 1.2.
    pd.DataFrame(access).to_excel(os.path.join(src_dir, '网站可访问信息.xlsx'), index=False)
    pd.DataFrame(level).to_excel(os.path.join(src_dir, '中高危网站详情.xlsx'), index=False)
def gen_bugs_report(src: str):
    """Merge the per-severity vulnerability tables of every report in *src*
    into four documents: pressing/high/medium/low .docx.

    The number of findings per severity is parsed from the report body
    text after the section numbers 3.1 (urgent), 3.2 (high), 3.3 (medium)
    and 3.4 (low); the detail tables start at ``doc.tables[3]``.

    :param src: directory containing the .docx reports (output also goes here)
    :return: None
    """

    def collect(doc, start, count):
        # First-column cells go to ``rows``, the remaining cells to
        # ``cols`` - mirrors what every duplicated loop in the original did.
        rows, cols = [], []
        for table in doc.tables[start:start + count]:
            for row in table.rows:
                # BUG FIX: one original copy read ``table.cells[0]``
                # instead of ``row.cells[0]``.
                rows.append(row.cells[0])
                cols.extend(row.cells[1:])
        return rows, cols

    press_docx = Document()
    high_docx = Document()
    medium_docx = Document()
    low_docx = Document()
    for fname in os.listdir(src):
        doc = Document(os.path.join(src, fname))
        text = ''.join(para.text for para in doc.paragraphs)
        # BUG FIX: the original patterns wrote ``(d+)`` (a literal run of
        # the letter d) where ``(\d+)`` (digits) was intended, so the
        # severity counts were never captured.  The dot in "3.1" etc. is
        # escaped as well.
        pressing = re.match(r'.*3\.1.(\d+).*', text)
        high = re.match(r'.*3\.2.(\d+).*', text)
        medium = re.match(r'.*3\.3.(\d+).*', text)
        low = re.match(r'.*3\.4.(\d+).*', text)
        if pressing:
            pressing = int(pressing.group(1))
            rows, cols = collect(doc, 3, pressing)
            # NOTE(review): python-docx Document.add_table expects integer
            # row/column *counts*, not cell lists - this call (kept from
            # the original) likely needs rework; verify against python-docx.
            press_docx.add_table(rows, cols)
        if high:
            high = int(high.group(1))
            rows, cols = collect(doc, 3, high)
            high_docx.add_table(rows, cols)
        if medium:
            medium = int(medium.group(1))
            # Medium tables follow the urgent (or high) ones.
            if pressing:
                start = 3 + pressing
            elif high:
                start = 3 + high
            else:
                start = 3
            rows, cols = collect(doc, start, medium)
            medium_docx.add_table(rows, cols)
        if low:
            low = int(low.group(1))
            # Low tables follow whichever earlier sections are present
            # (same offset logic as the original, minus the ``+ +`` typo).
            if pressing and medium:
                start = 3 + pressing + medium
            elif high and medium:
                start = 3 + high + medium
            elif pressing:
                start = 3 + pressing
            elif high:
                start = 3 + high
            else:
                start = 3
            rows, cols = collect(doc, start, low)
            # BUG FIX: low-severity tables previously went into
            # ``medium_docx``; they belong in ``low_docx`` (which was
            # created and saved but never written to).
            low_docx.add_table(rows, cols)
    press_docx.save(os.path.join(src, 'pressing.docx'))
    high_docx.save(os.path.join(src, 'high.docx'))
    medium_docx.save(os.path.join(src, 'medium.docx'))
    low_docx.save(os.path.join(src, 'low.docx'))
def split_by_area(src: str, file: str):
    """Sort the .docx reports in *src* into per-area sub-directories.

    The area is recognised by checking whether the first cell of
    ``tables[0]`` *ends* with one of the values of the JSON mapping in
    *file*; unmatched documents go into the directory named by the
    mapping's ``"0"`` entry.

    :param src: directory containing the .docx reports
    :param file: path to the JSON file mapping keys to area names
    :return: None
    """
    # ``with`` closes the config file (the original handle was leaked).
    with open(file, encoding='utf8') as fp:
        area_conf = json.load(fp)
    default_dir = os.path.join(src, area_conf['0'])
    for fname in os.listdir(src):
        doc = Document(os.path.join(src, fname))
        text = doc.tables[0].rows[0].cells[0].text
        for area in area_conf.values():
            if re.search(r'.*(' + area + r')$', text):
                target = os.path.join(src, area)
                os.makedirs(target, exist_ok=True)
                shutil.move(os.path.join(src, fname), os.path.join(target, fname))
                break
        else:
            # for/else: no area matched - file it under the default area.
            # Create the directory first; the original crashed if the
            # default directory did not exist yet.
            os.makedirs(default_dir, exist_ok=True)
            shutil.move(os.path.join(src, fname), os.path.join(default_dir, fname))
if __name__ == '__main__':
    # Pipeline: convert the .doc reports, build the statistics table,
    # derive the access/vulnerability sheets, then normalise file names.
    base_dir = input('请输入文件目录:')
    doc_subdir = 'doc_/'
    docx_dir = base_dir + 'docx_/'
    stats_path = base_dir + '漏洞统计.xlsx'
    doc_convert_to_docx(src_doc=base_dir + doc_subdir, to_docx=docx_dir)
    gen_statistics_table(src_dir=docx_dir, file=stats_path)
    gen_bugs_and_access_table(src_dir=base_dir, path_dir=docx_dir, file='漏洞统计.xlsx')
    fmt_file_name(src_dir=base_dir + doc_subdir, path_dir=docx_dir, file=stats_path)
还有一个脚本,要求是根据单位名称,按给定的单位类型和行业类型做“合适的”选择(说到这个就想吐槽几句):
"""Tag each 单位名称 (organisation name) with a 单位类型 (organisation
type) and 行业类型 (industry type) by keyword lookup from auto_conf.json,
then write the enriched spreadsheet back out."""
import pandas as pd
import json

# ``with`` closes the config file (the original handle was leaked).
with open('../auto_conf.json', 'r', encoding='gbk') as conf_file:
    import_conf = json.load(conf_file)

# NOTE: the old read_excel call passed ``index=False`` (never a valid
# read_excel parameter) and ``encoding='gbk'`` (removed from pandas).
data = pd.read_excel('../1.xlsx')

company_map = import_conf['单位类型']
industry_map = import_conf['行业类型']
companies = []
industry = []
for name in data['单位名称']:
    # First keyword contained in the name wins; iteration follows the
    # JSON key order (the keys are plain literals, so the original
    # ``re.match('.*(' + k + ').*', name)`` is just a substring test).
    for keyword, kind in company_map.items():
        if keyword in name:
            companies.append(kind)
            break
    else:
        companies.append('其它')
    for keyword, kind in industry_map.items():
        if keyword in name:
            industry.append(kind)
            break
    else:
        industry.append('其它')

# BUG FIX: DataFrame.update() silently ignores columns that do not yet
# exist, so the two new columns never reached the output file.  Assign
# them directly instead.
data['单位类型'] = companies
data['行业类型'] = industry
data.to_excel('C:/Users/User/Documents/1.xlsx', index=False)
这个 auto_conf.json 是我自己弄的,类似做机器学习的时候自己弄数据集,但这里数据量太小,才500个网站,不好用机器学习,而且自建数据集麻烦,所以选择了用 regex 处理。
auto_conf.json:
{
"单位类型": {
"环境保护": "政府机关",
"财政": "政府机关",
"局": "事业单位",
"小学": "事业单位",
"中学": "事业单位",
"大学": "事业单位",
"学院": "事业单位",
"学校": "事业单位",
"学": "其它",
"公司": "企业",
"中心": "事业单位",
"会": "党委机关",
"所": "事业单位",
"站": "事业单位",
"队": "事业单位",
"院": "事业单位",
"社": "社团组织",
"台": "事业单位",
"宫": "事业单位",
"馆": "事业单位",
"党政": "党委机关",
"监狱": "政府机关",
"政府": "政府机关"
},
"行业类型": {
"卫生": "卫生",
"学校": "教育",
"小学": "教育",
"中学": "教育",
"大学": "教育",
"学院": "教育",
"贴吧": "经营性公众互联网",
"学点": "经营性公众互联网",
"网": "经营性公众互联网",
"医院": "卫生",
"政府": "政府部门",
"委员": "政府部门",
"通信": "电信",
"广播": "广电",
"监狱": "司法",
"铁路": "铁路",
"银行": "银行",
"海关": "海关",
"税务": "税务",
"民航": "民航",
"电力": "电力",
"证券": "证券",
"保险": "保险",
"水务": "水利",
"公安": "公安",
"财政": "财政",
"审计": "审计",
"贸易": "商业贸易",
"国土": "国土资源",
"冶": "能源",
"统计": "统计",
"行政管理": "行政管理",
"邮政": "邮政",
"教育": "教育",
"农业": "农业",
"水利": "水利",
"文化": "文化",
"科技": "科技",
"广告": "宣传",
"监督检测": "质量监督检验检疫",
"人力": "人事劳动和社会保障",
"气象": "统计"
}
}
一些参考链接:
1.https://code.activestate.com/recipes/279003-converting-word-documents-to-text/
2.https://stackoverflow.com/questions/1468099/python-win32-extensions-documentation
3.https://stackoverflow.com/questions/38468442/multiple-doc-to-docx-file-conversion-using-python