需求是:有一堆这样的 Word 文档,需要转换成试题,供 Web 界面使用。
#!/usr/bin/env python3
import docx
import re
import json
DOCX_PATH = "./2018《廉洁自律准则》知识竞赛试题及答案.docx"
JSON_PATH = "data.json"

# Answer key: optional whitespace followed by any run of A-D that sits right
# after an opening parenthesis.  The pattern in the original script was
# garbled (unbalanced parentheses); both the ASCII "(" and the full-width
# "（" are accepted here.  NOTE(review): confirm against the real documents.
_KEY_RE = re.compile(r"(?<=[(（])\s*[A-D]*")
# Same span plus trailing whitespace; used to blank the answer out of the
# question text.
_KEY_BLANK_RE = re.compile(r"(?<=[(（])\s*[A-D]*\s*")
_STARTS_WITH_DIGIT_RE = re.compile(r"\d")


def parse_questions(tokens):
    """Turn whitespace-separated document tokens into question dicts.

    A token is only considered when it contains a ".".  The text before the
    first "." is the label and everything after it is the body (dots inside
    the body are kept).

    * A token starting with a digit opens a new question:
      ``no`` is the label, ``q`` is the body with the answer letters after
      each opening parenthesis replaced by a single space, and ``k`` is the
      answer found after the first opening parenthesis (only set when the
      body contains an opening parenthesis; it may be an empty string).
    * A token starting with ``A``/``B``/``C``/``D`` stores its body under
      ``a``/``b``/``c``/``d`` of the most recent question.  Option tokens
      that appear before any question are ignored.

    Args:
        tokens: iterable of strings, e.g. the result of ``str.split()`` on
            each paragraph's text.

    Returns:
        A list of dicts in the order the questions were encountered.
    """
    questions = []
    current = None
    for token in tokens:
        label, dot, body = token.partition(".")
        if not dot:
            # No "label.body" structure: nothing to extract from this token.
            continue
        if _STARTS_WITH_DIGIT_RE.match(token):
            current = {"no": label}
            key = _KEY_RE.search(body)
            if key is not None:
                current["k"] = key.group(0).lstrip()
            current["q"] = _KEY_BLANK_RE.sub(" ", body)
            questions.append(current)
        elif token[0] in "ABCD" and current is not None:
            # Options attach to the question that was opened last.
            current[token[0].lower()] = body
    return questions


def main():
    """Read the Word document and dump the parsed questions to JSON_PATH."""
    document = docx.Document(DOCX_PATH)
    tokens = (
        token
        for para in document.paragraphs
        for token in para.text.split()
    )
    questions = parse_questions(tokens)
    # Explicit UTF-8: ensure_ascii=False writes the Chinese text verbatim, so
    # the output must not depend on the platform's default encoding.
    with open(JSON_PATH, "w", encoding="utf-8") as outfile:
        json.dump(questions, outfile, ensure_ascii=False)


if __name__ == "__main__":
    main()
转换过程并不完美:由于 Word 文档的格式并不规范,大约 90% 的内容可以正确转换,其余部分仍存在问题。