zoukankan html css js c++ java
python docx通过关键字标注字体以及颜色大小等

主要使用 python-docx 与 pandas。
python-docx 对表格的解析不够友好且效率较低，因此先把表格内容转换成 pandas 的 DataFrame，再进行关键字判断。
代码如下：
# coding:utf-8
import os, re
import docx
from docx.document import Document as dc
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import RGBColor  # 设置字体颜色
from docx import Document
from docx.shared import Pt  # 设置字体
from docx.oxml.ns import qn  # 设置中文字体
import pandas as pd

# Path of the .docx file to process (the value looks like a placeholder —
# replace it with a real path before running).
FILE_PATH = r"D:xxxxxxxxxxxxxxxx.docx"

# Document object opened from FILE_PATH; the top-level script below iterates
# over it and saves the annotated copy.
obj = docx.Document(FILE_PATH)


def iter_block_items(parent):
    """Yield the paragraphs and tables found directly under ``parent``.

    Children are visited in the order ``iterchildren()`` returns them, so
    paragraphs and tables come out interleaved as they are stored.

    :param parent: a document object (``dc``) or a table cell (``_Cell``)
    :return: generator of ``Paragraph`` and ``Table`` objects
    :raises ValueError: if ``parent`` is neither a document nor a cell
    """
    if not isinstance(parent, (dc, _Cell)):
        raise ValueError("[TypeError] Document in insuitable type.")

    # A document keeps its block items in the body element, a cell in its tc
    container = parent.element.body if isinstance(parent, dc) else parent._tc

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
        # any other child element type is skipped


def table2list(table):
    """Return the cell texts of ``table`` as a list of rows.

    :param table: object exposing ``rows``; each row exposes ``cells`` and
        each cell exposes ``text``
    :return: list with one inner list of cell texts per row
    """
    return [[cell.text for cell in row.cells] for row in table.rows]

# Keyword searched for inside paragraph runs; its occurrences get re-styled
word = '段落关键字'
# Keyword compared against the first-column text of each table row
table_text = '表格关键字'


def set_run(run, font_size, bold, color, name):
    '''
    Apply character formatting to a run object.

    :param run: the run to format
    :param font_size: font size
    :param bold: whether the text is bold
    :param color: font colour
    :param name: font name
    :return: None
    '''
    font = run.font
    font.size = font_size
    run.bold = bold
    font.color.rgb = color
    font.name = name
    # Per the original author's note, setting font.name alone is not enough:
    # the w:eastAsia attribute of rFonts has to carry the font name as well.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), name)


def paragraphs_utils(obj):
    """Highlight every occurrence of the module-level ``word`` in paragraphs.

    Each run whose text contains ``word`` is emptied and replaced by a series
    of new runs (text before, keyword, text after, ...).  The new runs keep
    the original run's size, bold flag and colour, use the 楷体 font, and the
    keyword runs are coloured red.  The document is then saved as
    '标注后的文档.docx'.

    Only keywords that lie completely inside a single run are detected.

    :param obj: document object exposing ``paragraphs`` and ``save``
    :return: None
    """
    for p in obj.paragraphs:
        for r in p.runs:
            if word not in r.text:
                # keyword is not part of this run
                continue
            font_size = r.font.size
            bold = r.bold
            color = r.font.color.rgb
            name = u'楷体'
            # Split the run text around the keyword, then empty the run
            rest = r.text.split(word)
            r.text = ''
            # add_run() appends at the END of the paragraph.  Leaving the new
            # runs there would push the text behind any runs that follow r,
            # so each new run is moved directly after the previous piece.
            anchor = r._element
            for index, text in enumerate(rest):
                if index:
                    # every gap between two pieces is one keyword occurrence
                    run = p.add_run(word)
                    set_run(run, font_size, bold, color, name)
                    run.font.color.rgb = RGBColor(255, 0, 0)
                    anchor.addnext(run._element)
                    anchor = run._element
                run = p.add_run(text=text)
                set_run(run, font_size, bold, color, name)
                anchor.addnext(run._element)
                anchor = run._element
    obj.save('标注后的文档.docx')


def table_utils(obj):
    """Re-style table rows whose first cell equals the module-level ``table_text``.

    For every matching row (the first row is never checked), the runs of the
    first paragraph in each cell after the first column are rewritten: the
    stripped text is re-added as a new run with the original size and bold
    flag, the 楷体 font and red colour.  The document is then saved as
    '标注后的表格.docx'.

    :param obj: document object exposing ``tables`` and ``save``
    :return: None
    """
    for table in obj.tables:
        # Work on a DataFrame copy of the cell texts for the keyword lookup
        frame = pd.DataFrame(table2list(table))
        row_count, col_count = frame.shape
        # row 0 is skipped
        for row_idx in range(1, row_count):
            if table_text != frame.iloc[row_idx, 0]:
                continue
            # column 0 holds the keyword itself and is left untouched
            for col_idx in range(1, col_count):
                first_par = table.cell(row_idx, col_idx).paragraphs[0]
                for old_run in first_par.runs:
                    font_size = old_run.font.size
                    bold = old_run.bold
                    color = old_run.font.color.rgb
                    name = u'楷体'
                    data = old_run.text.strip()
                    # empty the old run, then re-add its text as a new run
                    old_run.text = ''
                    new_run = first_par.add_run(data)
                    set_run(new_run, font_size, bold, color, name)
                    new_run.font.color.rgb = RGBColor(255, 0, 0)
    obj.save('标注后的表格.docx')


# Walk paragraphs and tables in document order and annotate both kinds,
# then save everything into a single output file.
for block in iter_block_items(obj):
    if isinstance(block, Paragraph):
        for r in block.runs:
            if word not in r.text:
                continue
            print(r.text)
            print(r.style.name)
            font_size = r.font.size
            bold = r.bold
            color = r.font.color.rgb
            name = u'楷体'
            # Split the run text around the keyword, then empty the run
            rest = r.text.split(word)
            r.text = ''
            # add_run() appends at the END of the paragraph.  Leaving the new
            # runs there would push the text behind any runs that follow r,
            # so each new run is moved directly after the previous piece.
            anchor = r._element
            for index, text in enumerate(rest):
                if index:
                    # every gap between two pieces is one keyword occurrence
                    run = block.add_run(word)
                    set_run(run, font_size, bold, color, name)
                    run.font.color.rgb = RGBColor(255, 0, 0)
                    anchor.addnext(run._element)
                    anchor = run._element
                run = block.add_run(text=text)
                set_run(run, font_size, bold, color, name)
                anchor.addnext(run._element)
                anchor = run._element
    else:
        # Convert the table to a DataFrame of cell texts for the lookup
        pd_block = pd.DataFrame(table2list(block))
        # shape is (row count, column count); row 0 is skipped
        for rows in range(1, pd_block.shape[0]):
            # only rows whose first cell equals the table keyword are handled
            if table_text != pd_block.iloc[rows, 0]:
                continue
            # column 0 holds the keyword itself and is left untouched
            for cols in range(1, pd_block.shape[1]):
                # only the first paragraph of the cell is rewritten
                rs = block.cell(rows, cols).paragraphs[0]
                for r in rs.runs:
                    font_size = r.font.size
                    bold = r.bold
                    color = r.font.color.rgb
                    name = u'楷体'
                    data = r.text.strip()
                    # empty the old run, then re-add its text as a new run
                    r.text = ''
                    run = rs.add_run(data)
                    set_run(run, font_size, bold, color, name)
                    run.font.color.rgb = RGBColor(255, 0, 0)

obj.save('段落与表格标注后的文档.docx')
匹配关键字回写docx替换颜色
查看全文
相关阅读:
一些业内有名的网站收集
 WCF重载
 FCKEditor fckconfig.js配置，添加字体和大小附：中文字体乱码问题解决
 查询第几条到第几条的数据的SQL语句
 SPOJ 9939 Eliminate the Conﬂict
UVA 10534 Wavio Sequence
HDU 3474 Necklace
POJ 2823 Sliding Window
UVA 437 The Tower of Babylon
UVA 825 Walking on the Safe Side
原文地址：https://www.cnblogs.com/nixindecat/p/12157623.html