zoukankan      html  css  js  c++  java
  • 关于文档处理的各种脚本

      暂时记录,改天再整理

    import json
    import re
    import os
    import pandas as pd
    from requests import get
    from docx import Document
    import win32com.client as win
    # import subprocess
    import socket
    import shutil
    # import sys
    from bs4 import BeautifulSoup
    # import openpyxl
    
    
    def doc_convert_to_docx(src_doc: str, to_docx: str):
        """
        Convert every .doc file in ``src_doc`` to .docx via the MS Word COM API.

        :param src_doc: directory holding the source .doc files
                        (must end with a path separator, e.g. 'C:/in/doc_/')
        :param to_docx: output directory for the converted .docx files
        :return: None
        """
        # makedirs handles nested paths; mkdir would fail on them.
        if not os.path.isdir(src_doc):
            os.makedirs(src_doc)
        if not os.path.isdir(to_docx):
            os.makedirs(to_docx)
        word = win.Dispatch('Word.Application')
        try:
            for name in os.listdir(src_doc):
                doc = word.Documents.Open(src_doc + name)
                try:
                    # Swap only the final extension; str.replace('.doc', ...)
                    # would also hit a '.doc' substring inside the file name.
                    stem, _ = os.path.splitext(name)
                    # FileFormat=12 == wdFormatXMLDocument (.docx)
                    doc.SaveAs(to_docx + stem + '.docx', FileFormat=12)
                finally:
                    # Close each document, otherwise Word accumulates every
                    # opened file for the lifetime of the process.
                    doc.Close()
        finally:
            word.Quit()
    
    
    def gen_statistics_table(src_dir: str, file: str):
        """
        Build a statistics workbook from the second table of each docx report.

        For every document in ``src_dir`` this reads table index 1 (header row
        plus first data row), collects the cell texts into one column-per-header
        mapping, and writes the result to ``file`` as an Excel workbook.

        :param src_dir: directory holding the .docx documents
        :param file: full path of the statistics workbook to write
        :return: None
        """
        data = {}
        for i, name in enumerate(os.listdir(src_dir)):
            doc = Document(docx=src_dir + name)
            table = doc.tables[1]
            header_row = table.rows[0]
            value_row = table.rows[1]
            # NOTE(review): assumes every document shares the same header
            # cells; a document with a different header raises KeyError here.
            for head, value in zip(header_row.cells, value_row.cells):
                if i == 0:
                    data[head.text] = [value.text]
                else:
                    data[head.text].append(value.text)
        # The ``encoding`` kwarg was removed from DataFrame.to_excel in
        # pandas 2.0; xlsx content is always Unicode, so dropping it is safe.
        pd.DataFrame(data).to_excel(file, index=False)
    
    
    def fmt_file_name(src_dir: str, path_dir: str, file: str):
        """
        Rename the doc/docx report files to '<site name>_<domain>.doc(x)'.

        Looks up each document's site name (table 1, row 1, cell 1) in the
        statistics workbook; on a match, extracts the domain from the current
        file name and renames both the .docx and its matching .doc source.

        :param src_dir: directory with the original .doc files
        :param path_dir: directory with the converted .docx files
        :param file: full path of the statistics workbook
        :return: None
        """
        # pandas.read_excel has no ``encoding`` parameter (TypeError on
        # current pandas), so it is dropped here.
        data = pd.read_excel(file)
        for name in os.listdir(path_dir):
            doc = Document(path_dir + name)
            site = doc.tables[1].rows[1].cells[1].text
            for site_name in data['网站名称']:
                if site_name != site:
                    continue
                # File names look like '<x>_<domain>_<y>.docx'; the '.' before
                # 'docx' is escaped now, and a non-matching name is skipped
                # instead of crashing on a None match object.
                gs = re.match(r'.*_(.*)_.*\.docx', name)
                if gs is None:
                    break
                domain = gs.group(1)
                os.rename(path_dir + name,
                          path_dir + site_name + '_' + domain + '.docx')
                os.rename(src_dir + name.replace('.docx', '.doc'),
                          src_dir + site_name + '_' + domain + '.doc')
                # Stop here: the file on disk is renamed, so ``name`` is stale
                # and a second matching row would attempt a failing rename.
                break
    
    
    def _append_bug_row(level: dict, doc, url: str, domain: str,
                        pressing, high, medium):
        """
        Append one vulnerability record for ``doc`` to the ``level`` mapping.

        Site name comes from table 1 (row 1, cell 1) and the vulnerability
        detail from table 2 (row 1, cell 0) of the document.
        """
        info_row = doc.tables[1].rows[1]
        detail = doc.tables[2].rows[1].cells[0].text
        level['网站网址'].append(url)
        level['网站名称'].append(info_row.cells[1].text)
        level['网站域名'].append(domain)
        level['漏洞详情'].append(detail)
        # Counts are clamped at 0.  (The original medium-only branch tested
        # ``medium > 0`` where it meant ``high > 0`` and could append a raw
        # non-positive ``high`` value; clamping fixes that.)
        level['紧急数量'].append(pressing if pressing > 0 else 0)
        level['高危数量'].append(high if high > 0 else 0)
        level['中危数量'].append(medium if medium > 0 else 0)


    def gen_bugs_and_access_table(src_dir: str, path_dir: str, file: str):
        """
        Build the vulnerability-detail and accessibility workbooks.

        Reads the statistics workbook and, for every URL row:
          * resolves the domain's IP and probes the URL over HTTP, recording
            reachability into '网站可访问信息.xlsx';
          * if the row reports any 紧急/高危/中危 findings, finds the matching
            docx report and records the details into '中高危网站详情.xlsx'.

        :param src_dir: directory of the statistics workbook (and the outputs)
        :param path_dir: directory with the .docx reports
        :param file: statistics workbook file name
        :return: None
        """
        # read_excel has no ``encoding`` parameter — dropped (TypeError on
        # current pandas).
        data = pd.read_excel(src_dir + file)
        urls = data['网站URL']
        pressings = data['紧急']
        highs = data['高危']
        mediums = data['中危']
        names = os.listdir(path_dir)
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
        }
        access = {'网站网址': [], '网站名称': [], '网站域名': [], '网站IP': [], '网站是否可正常访问': [], '响应状态码': []}
        level = {'网站网址': [], '网站名称': [], '网站域名': [], '紧急数量': [], '高危数量': [], '中危数量': [], '漏洞详情': []}
        for url, pressing, high, medium in zip(urls, pressings, highs, mediums):
            # Capture only the host part.  The original r'https?://(.*..*)/?'
            # grabbed the whole remainder (path included) and then stripped
            # every '/', which mangled any URL carrying a path.
            mat = re.match(r'https?://([^/]+)', url)
            domain = mat.group(1) if mat else ''
            ip = ''
            try:
                ip = socket.gethostbyname(domain)
            except OSError:
                pass  # unresolvable host: leave the IP blank (best effort)
            access['网站网址'].append(url)
            access['网站域名'].append(domain)
            access['网站IP'].append(ip)
            # The three original branches (紧急-first / 高危-first / 中危-first)
            # only differed in append order; one shared pass covers them all.
            if pressing > 0 or high > 0 or medium > 0:
                for f in names:
                    doc = Document(path_dir + f)
                    if doc.tables[1].rows[1].cells[0].text == url:
                        _append_bug_row(level, doc, url, domain,
                                        pressing, high, medium)
            try:
                headers['Host'] = domain
                r = get(url, headers=headers, timeout=10)
                soup = BeautifulSoup(r.text, 'html.parser')
                # Keep the title as str: encoding it to gbk produced bytes,
                # which to_excel renders as a b'...' literal.
                title = soup.title.string
                if 200 <= r.status_code <= 301:
                    access['网站名称'].append(title)
                    access['网站是否可正常访问'].append('是')
                else:
                    access['网站名称'].append('无法访问')
                    access['网站是否可正常访问'].append('否')
                access['响应状态码'].append(r.status_code)
            except Exception:
                # Network failure, timeout, or a page without a <title>.
                access['网站名称'].append('无法访问')
                access['网站是否可正常访问'].append('否')
                access['响应状态码'].append('null')

        pd.DataFrame(access).to_excel(src_dir + '网站可访问信息.xlsx', index=False)
        pd.DataFrame(level).to_excel(src_dir + '中高危网站详情.xlsx', index=False)
    
    
    def _merge_tables(dst, src_doc, start: int, count: int):
        """
        Copy tables ``start``..``start+count`` of ``src_doc`` into ``dst``.

        Keeps the original accumulation behavior: first-column cells gather in
        ``rows``, the remaining cells in ``cols``, growing across tables, with
        one ``add_table`` call per source table.

        NOTE(review): python-docx ``Document.add_table`` expects integer row
        and column *counts*; passing cell lists (as the original code also did)
        fails at runtime — kept unchanged pending clarification of the
        intended output format.
        """
        rows = []
        cols = []
        for table in src_doc.tables[start:start + count]:
            for row in table.rows:
                # The original had a one-off ``table.cells[0]`` here (Table
                # has no such attribute); every sibling branch used
                # ``row.cells[0]``, which is what is meant.
                rows.append(row.cells[0])
                cols.extend(row.cells[1:])
            dst.add_table(rows, cols)


    def _section_count(text: str, section: str):
        """
        Return the count following heading ``section`` (e.g. '3.1') in
        ``text``, or None when the heading is absent.

        The original patterns spelled ``(d+)`` — literal letter 'd' — where
        digits ``(\\d+)`` were clearly intended, so no count ever parsed.
        """
        m = re.match(r'.*' + re.escape(section) + r'.(\d+).*', text)
        return int(m.group(1)) if m else None


    def gen_bugs_report(src: str):
        """
        Merge the vulnerability tables of every docx in ``src`` into four
        per-severity documents: pressing/high/medium/low .docx.

        Table-layout assumption (carried over from the original offsets):
        vulnerability tables start at index 3, ordered by severity — TODO
        confirm against a real report.
        """
        press_docx = Document()
        high_docx = Document()
        medium_docx = Document()
        low_docx = Document()
        for fname in os.listdir(src):
            doc = Document(src + fname)
            text = ''.join(p.text for p in doc.paragraphs)
            pressing = _section_count(text, '3.1')
            high = _section_count(text, '3.2')
            medium = _section_count(text, '3.3')
            low = _section_count(text, '3.4')
            if pressing:
                _merge_tables(press_docx, doc, 3, pressing)
            if high:
                # NOTE(review): high tables also start at offset 3, i.e. they
                # would overlap the pressing tables — preserved from the
                # original; verify against the report layout.
                _merge_tables(high_docx, doc, 3, high)
            if medium:
                if pressing:
                    _merge_tables(medium_docx, doc, 3 + pressing, medium)
                elif high:
                    _merge_tables(medium_docx, doc, 3 + high, medium)
                else:
                    _merge_tables(medium_docx, doc, 3, medium)
            if low:
                # Fixed: every low branch originally appended into
                # ``medium_docx``; low findings now go to ``low_docx``.
                if pressing and medium:
                    _merge_tables(low_docx, doc, 3 + pressing + medium, low)
                elif high and medium:
                    # (also fixed a stray '+ +' in this offset expression)
                    _merge_tables(low_docx, doc, 3 + high + medium, low)
                elif pressing:
                    _merge_tables(low_docx, doc, 3 + pressing, low)
                elif high:
                    _merge_tables(low_docx, doc, 3 + high, low)
                else:
                    _merge_tables(low_docx, doc, 3, low)
        press_docx.save(src + 'pressing.docx')
        high_docx.save(src + 'high.docx')
        medium_docx.save(src + 'medium.docx')
        low_docx.save(src + 'low.docx')
    
    
    def split_by_area(src: str, file: str):
        """
        Move each docx in ``src`` into a sub-directory named after its area.

        The area is recognized by matching the end of the first table's first
        cell against the values of the JSON config ``file``; documents that
        match no area go into the fallback directory ``area_conf['0']``.

        :param src: directory with the .docx files (trailing path separator)
        :param file: path of the JSON area-configuration file
        :return: None
        """
        # ``with`` guarantees the config handle is closed (the original
        # left it open).
        with open(file, encoding='utf8') as fp:
            area_conf = json.load(fp)
        for f in os.listdir(src):
            doc = Document(src + f)
            text = doc.tables[0].rows[0].cells[0].text
            for v in area_conf.values():
                # NOTE(review): ``v`` is interpolated unescaped — an area name
                # containing regex metacharacters would change the match.
                if re.search(r'.*(' + v + r')$', text):
                    os.makedirs(src + v, exist_ok=True)
                    shutil.move(src + f, src + v + '/' + f)
                    break
            else:
                # Create the fallback bucket too; the original only created
                # matched-area directories, so this move could fail.
                os.makedirs(src + area_conf['0'], exist_ok=True)
                shutil.move(src + f, src + area_conf['0'] + '/' + f)
    
    
    if __name__ == '__main__':
        # Driver: convert .doc -> .docx, build the statistics workbook,
        # probe the sites, then normalize the file names (same order as
        # the original script).
        base_dir = input('请输入文件目录:')
        doc_dir = base_dir + 'doc_/'
        docx_dir = base_dir + 'docx_/'
        table_name = '漏洞统计.xlsx'
        table_path = base_dir + table_name

        doc_convert_to_docx(src_doc=doc_dir, to_docx=docx_dir)
        gen_statistics_table(src_dir=docx_dir, file=table_path)
        gen_bugs_and_access_table(src_dir=base_dir, path_dir=docx_dir, file=table_name)
        fmt_file_name(src_dir=doc_dir, path_dir=docx_dir, file=table_path)
    

  还有一个脚本(要求是根据单位名称,按给定的单位类型和行业类型做“合适的”选择——说到这个就想吐槽几句):

    import pandas as pd
    import json
    import re


    # Classify each company name into a unit type (单位类型) and an industry
    # type (行业类型) by matching the keyword patterns from auto_conf.json.
    with open('../auto_conf.json', 'r', encoding='gbk') as f:
        import_conf = json.load(f)

    # pandas.read_excel accepts neither ``index`` nor ``encoding`` — both
    # invalid kwargs were dropped.
    data = pd.read_excel('../1.xlsx')

    # Hoist the loop-invariant lookups out of the per-name loop.
    unit_types = import_conf['单位类型']
    industry_types = import_conf['行业类型']

    companies = []
    industry = []

    for name in data['单位名称']:
        # re.search(k, name) is equivalent to the original
        # re.match(r'.*(' + k + r').*', name) for substring matching; the
        # original mixed match/search between the two loops.
        # NOTE(review): the config keys are interpolated unescaped — a key
        # with regex metacharacters would change the match.
        for k, v in unit_types.items():
            if re.search(k, name):
                companies.append(v)
                break
        else:
            companies.append('其它')

        for k, v in industry_types.items():
            if re.search(k, name):
                industry.append(v)
                break
        else:
            industry.append('其它')

    # Direct column assignment: DataFrame.update only touches columns that
    # already exist, so new '单位类型'/'行业类型' columns were silently dropped.
    data['单位类型'] = companies
    data['行业类型'] = industry
    data.to_excel('C:/Users/User/Documents/1.xlsx', index=False)
    

      这个 auto_conf.json 是我自己弄的,类似做机器学习的时候自己弄数据集,但这里数据量太小,才500个网站,不好用机器学习,而且自建数据集麻烦,所以选择了用 regex 处理。

      auto_conf.json:

    {
        "单位类型": {
            "环境保护": "政府机关",
            "财政": "政府机关",
            "局": "事业单位",
            "小学": "事业单位",
            "中学": "事业单位",
            "大学": "事业单位",
            "学院": "事业单位",
            "学校": "事业单位",
            "学": "其它",
            "公司": "企业",
            "中心": "事业单位",
            "会": "党委机关",
            "所": "事业单位",
            "站": "事业单位",
            "队": "事业单位",
            "院": "事业单位",
            "社": "社团组织",
            "台": "事业单位",
            "宫": "事业单位",
            "馆": "事业单位",
            "党政": "党委机关",
            "监狱": "政府机关",
            "政府": "政府机关"
        },
        "行业类型": {
            "卫生": "卫生",
            "学校": "教育",
            "小学": "教育",
            "中学": "教育",
            "大学": "教育",
            "学院": "教育",
            "贴吧": "经营性公众互联网",
            "学点": "经营性公众互联网",
            "网": "经营性公众互联网",
            "医院": "卫生",
            "政府": "政府部门",
            "委员": "政府部门",
            "通信": "电信",
            "广播": "广电",
            "监狱": "司法",
            "铁路": "铁路",
            "银行": "银行",
            "海关": "海关",
            "税务": "税务",
            "民航": "民航",
            "电力": "电力",
            "证券": "证券",
            "保险": "保险",
            "水务": "水利",
            "公安": "公安",
            "财政": "财政",
            "审计": "审计",
            "贸易": "商业贸易",
            "国土": "国土资源",
            "冶": "能源",
            "统计": "统计",
            "行政管理": "行政管理",
            "邮政": "邮政",
            "教育": "教育",
            "农业": "农业",
            "水利": "水利",
            "文化": "文化",
            "科技": "科技",
            "广告": "宣传",
            "监督检测": "质量监督检验检疫",
            "人力": "人事劳动和社会保障",
            "气象": "统计"
        }
    }

      一些参考链接:

        1.https://code.activestate.com/recipes/279003-converting-word-documents-to-text/

        2.https://stackoverflow.com/questions/1468099/python-win32-extensions-documentation

        3.https://stackoverflow.com/questions/10366596/how-to-read-contents-of-an-table-in-ms-word-file-using-python

        4.https://stackoverflow.com/questions/38468442/multiple-doc-to-docx-file-conversion-using-python

        5.https://www.jianshu.com/p/4fa504c720c1

  • 相关阅读:
    机器任务——最小点覆盖
    树的统计
    农夫约翰
    关押罪犯
    题单
    加分二叉树(递归,区间DP)
    [动态规划] 斜率优化DP
    [树形DP] 换根DP
    [期望DP][SCOI2008] 奖励关
    [数位DP][AHOI2009] Luogu P4127 同类分布
  • 原文地址:https://www.cnblogs.com/darkchii/p/12051950.html
Copyright © 2011-2022 走看看