zoukankan      html  css  js  c++  java
  • python docx通过关键字标注字体以及颜色大小等

    主要使用python-docx 与pandas

    因为 python-docx 对表格的解析不够友好且效率较低,所以先把表格内容转换成列表,再转成 pandas 的 DataFrame 进行处理

    代码如下

    # coding:utf-8
    import os, re
    import docx
    from docx.document import Document as dc
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P
    from docx.table import _Cell, Table
    from docx.text.paragraph import Paragraph
    from docx.shared import RGBColor  # 设置字体颜色
    from docx import Document
    from docx.shared import Pt  # 设置字体
    from docx.oxml.ns import qn  # 设置中文字体
    import pandas as pd
    
    # Path of the .docx file to annotate (the value looks like a placeholder;
    # replace it with a real path before running).
    FILE_PATH = r"D:xxxxxxxxxxxxxxxx.docx"
    
    # Open the document once; the script at the bottom of the file works on this object.
    obj = docx.Document(FILE_PATH)
    
    
    def iter_block_items(parent):
        """Yield the paragraphs and tables that are direct children of *parent*.

        Items are produced in the order ``iterchildren()`` returns them; child
        elements that are neither a paragraph nor a table are skipped.

        :param parent: a document object (``dc``) or a table cell (``_Cell``)
        :return: generator of ``Paragraph`` / ``Table`` objects bound to *parent*
        :raises ValueError: if *parent* is neither a document nor a cell
        """
        if isinstance(parent, dc):
            container = parent.element.body
        elif isinstance(parent, _Cell):
            container = parent._tc
        else:
            raise ValueError("[TypeError] Document in insuitable type.")

        # XML element class -> wrapper class, checked in this order.
        wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
        for node in container.iterchildren():
            for xml_type, wrapper in wrappers:
                if isinstance(node, xml_type):
                    yield wrapper(node, parent)
                    break
    
    
    def table2list(table):
        """Return the table's cell texts as a list of rows.

        :param table: object exposing ``rows``; each row exposes ``cells`` and
            each cell exposes ``text``
        :return: list with one inner list of cell texts per row
        """
        return [[cell.text for cell in row.cells] for row in table.rows]
    
    # Keyword searched for inside paragraph runs
    word = '段落关键字'
    # Keyword compared against the first column of each table row
    table_text = '表格关键字'
    
    
    def set_run(run, font_size, bold, color, name):
        '''
        Apply size, weight, colour and typeface to a run.

        :param run: the run object to format
        :param font_size: font size to assign to ``run.font.size``
        :param bold: whether the run is bold
        :param color: value assigned to ``run.font.color.rgb``
        :param name: font name, also written to the ``w:eastAsia`` attribute
        :return: None
        '''
        font = run.font
        font.size = font_size
        run.bold = bold
        font.color.rgb = color
        font.name = name
        # The east-Asian font attribute is set separately on the run's rFonts
        # element, in addition to font.name above.
        run._element.rPr.rFonts.set(qn('w:eastAsia'), name)
    
    
    def paragraphs_utils(obj):
        '''
        Highlight every occurrence of the keyword ``word`` in the document's paragraphs.

        Each run whose text contains ``word`` is emptied and its text is
        re-emitted as new runs: the pieces between keywords keep the original
        size/bold/colour, and each keyword occurrence is coloured red. All new
        runs use the font name '楷体'.

        The new runs are moved to sit directly after the run they replace.
        ``add_run`` alone appends at the end of the paragraph, which would
        reorder the text whenever the keyword run is followed by other runs.

        :param obj: document object; modified in place and saved to '标注后的文档.docx'
        :return: None
        '''
        highlight = RGBColor(255, 0, 0)
        for p in obj.paragraphs:
            # p.runs is evaluated once per paragraph, so the runs inserted
            # below are not visited again by this loop.
            for r in p.runs:
                if word not in r.text:
                    continue
                font_size = r.font.size
                bold = r.bold
                color = r.font.color.rgb
                name = u'楷体'
                # Split the run text on the keyword, e.g. ['', 'xxx'] or ['xxx', ''].
                rest = r.text.split(word)

                # Interleave the plain pieces with the keyword itself.
                segments = []
                for text in rest[:-1]:
                    segments.append((text, color))
                    segments.append((word, highlight))
                segments.append((rest[-1], color))

                # Empty the original run; its text is rewritten by the new runs.
                r.text = ''
                anchor = r._element
                for text, rgb in segments:
                    if not text:
                        # Nothing to show; avoid creating an empty run.
                        continue
                    run = p.add_run(text)
                    set_run(run, font_size, bold, rgb, name)
                    # Move the appended run right behind the previous piece so
                    # the paragraph keeps its original reading order.
                    anchor.addnext(run._element)
                    anchor = run._element
        obj.save('标注后的文档.docx')
    
    
    def table_utils(obj):
        '''
        Colour red the data cells of every table row whose first cell equals ``table_text``.

        The first row and the first column of each table are skipped. For each
        remaining cell in a matching row, only the first paragraph is handled:
        every run is emptied and its stripped text is re-added as a new red run
        with the font name '楷体', keeping the original size and bold setting.

        :param obj: document object; modified in place and saved to '标注后的表格.docx'
        :return: None
        '''
        red = RGBColor(255, 0, 0)
        for tbl in obj.tables:
            # Cell texts as a DataFrame so rows/columns can be addressed by index.
            frame = pd.DataFrame(table2list(tbl))
            n_rows, n_cols = frame.shape
            for row_idx in range(1, n_rows):
                if table_text != frame.iloc[row_idx, 0]:
                    continue
                for col_idx in range(1, n_cols):
                    para = tbl.cell(row_idx, col_idx).paragraphs[0]
                    for old_run in para.runs:
                        size = old_run.font.size
                        is_bold = old_run.bold
                        old_rgb = old_run.font.color.rgb
                        content = old_run.text.strip()
                        # Empty the existing run, then rewrite its text as a new run.
                        old_run.text = ''
                        new_run = para.add_run(content)
                        set_run(new_run, size, is_bold, old_rgb, u'楷体')
                        new_run.font.color.rgb = red
        obj.save('标注后的表格.docx')
    
    
    # Walk the document body block by block, highlighting the paragraph keyword
    # inside paragraphs and the data cells of matching rows inside tables, then
    # save the result to a new file.
    for block in iter_block_items(obj):
        if isinstance(block, Paragraph):
            # block.runs is evaluated once, so runs inserted below are not revisited.
            for r in block.runs:
                if word not in r.text:
                    continue
                print(r.text)
                print(r.style.name)
                font_size = r.font.size
                bold = r.bold
                color = r.font.color.rgb
                name = u'楷体'
                # Split the run text on the keyword.
                rest = r.text.split(word)

                # Interleave the plain pieces with the keyword (keyword in red).
                segments = []
                for text in rest[:-1]:
                    segments.append((text, color))
                    segments.append((word, RGBColor(255, 0, 0)))
                segments.append((rest[-1], color))

                # Empty the original run; its text is rewritten by the new runs.
                r.text = ''
                anchor = r._element
                for text, rgb in segments:
                    if not text:
                        # Nothing to show; avoid creating an empty run.
                        continue
                    run = block.add_run(text)
                    set_run(run, font_size, bold, rgb, name)
                    # add_run appends at the end of the paragraph; move the new
                    # run right behind the previous piece so the text order is
                    # kept when other runs follow the keyword run.
                    anchor.addnext(run._element)
                    anchor = run._element
        else:
            # Cell texts as a DataFrame so rows/columns can be addressed by index.
            pd_block = pd.DataFrame(table2list(block))
            # shape is (row count, column count)
            for rows in range(pd_block.shape[0]):
                if rows == 0: continue
                # Only rows whose first cell equals the table keyword are handled.
                if table_text != pd_block.iloc[rows, 0]: continue
                for cols in range(pd_block.shape[1]):
                    if cols == 0: continue
                    # Only the first paragraph of the cell is processed.
                    rs = block.cell(rows, cols).paragraphs[0]
                    for r in rs.runs:
                        font_size = r.font.size
                        bold = r.bold
                        color = r.font.color.rgb
                        name = u'楷体'
                        data = r.text.strip()
                        # Empty the existing run, then rewrite its text as a new red run.
                        r.text = ''
                        run = rs.add_run(data)
                        set_run(run, font_size, bold, color, name)
                        run.font.color.rgb = RGBColor(255, 0, 0)

    obj.save('段落与表格标注后的文档.docx')

    匹配关键字回写docx替换颜色

  • 相关阅读:
    一些业内有名的网站收集
    WCF重载
    FCKEditor fckconfig.js配置,添加字体和大小 附:中文字体乱码问题解决
    查询第几条到第几条的数据的SQL语句
    SPOJ 9939 Eliminate the Conflict
    UVA 10534 Wavio Sequence
    HDU 3474 Necklace
    POJ 2823 Sliding Window
    UVA 437 The Tower of Babylon
    UVA 825 Walking on the Safe Side
  • 原文地址:https://www.cnblogs.com/nixindecat/p/12157623.html
Copyright © 2011-2022 走看看