#-*- coding:gbk -*-
"""Collect the text of every .docx file under a folder into an Excel sheet.

Each output row holds the path components of one document (one component
per cell) followed by the document text in the next cell.
"""
import os

import docx
import xlsxwriter
import xlwt
from win32com import client as wc

# Root folder that is scanned recursively for documents.  A raw string is
# required: in a normal literal the "\x" sequences are invalid escapes.
filepath = r"C:\xxx\xx\xx\xx\数据集"

# Only the first MAX_COLUMNS cells of a row are written (columns 0-20),
# which is the same limit the original write loop applied.
MAX_COLUMNS = 21


def getfilelist(filepath):
    """Return the paths of all files under `filepath`, recursing into subfolders.

    Entries are visited in `os.listdir` order; the files of a subfolder are
    inserted at the position where that subfolder was listed.
    """
    files = []
    for name in os.listdir(filepath):
        child = os.path.join(filepath, name)
        if os.path.isdir(child):
            files.extend(getfilelist(child))
        else:
            files.append(child)
    return files


def getDocx(fileName):
    """Return the text of the Word document `fileName`.

    The value is whatever `docx.getdocumenttext` returns; the caller below
    treats it as a sequence of strings -- presumably one per paragraph
    (TODO confirm against the installed docx package).
    """
    document = docx.opendocx(fileName)
    return docx.getdocumenttext(document)


def doc2Docx(fileName):
    """Convert the .doc file `fileName` to .docx via Word and delete the original.

    The converted copy is saved next to the source as `fileName + "x"`.
    The source file is removed only after the save succeeded and Word has
    released the document; Word is always shut down, even on failure.
    """
    word = wc.Dispatch("Word.Application")
    try:
        doc = word.Documents.Open(fileName)
        try:
            # File format 12 is wdFormatXMLDocument, i.e. .docx.
            doc.SaveAs(fileName + "x", 12, False, "", True, "", False, False, False, False)
        finally:
            doc.Close()
    finally:
        word.Quit()
    # Deleting before Close() would fail because Word still holds the file.
    os.remove(fileName)


def main():
    """Write the path components and text of every .docx file to the workbook."""
    filelist = getfilelist(filepath)

    # If the folder contains .doc files, convert them all to .docx first:
    #     for path in filelist:
    #         doc2Docx(path)

    docx_files = [path for path in filelist if path.endswith("docx")]

    # xlsxwriter is used here because an earlier xlwt-based version failed on
    # large documents with "Exception: String longer than 32767 characters".
    workbook = xlsxwriter.Workbook(u'数据.xlsx')
    worksheet = workbook.add_worksheet(u"数据")

    for row, fileName in enumerate(docx_files):
        paragraphs = getDocx(fileName)
        cells = fileName.split(os.sep)
        # Every paragraph is followed by a single space, as before.
        cells.append("".join(paragraph + " " for paragraph in paragraphs))
        for col, value in enumerate(cells[:MAX_COLUMNS]):
            worksheet.write(row, col, value)

    workbook.close()


if __name__ == "__main__":
    main()