zoukankan      html  css  js  c++  java
  • 使用python读取多重文件夹下的word(doc、docx)文件,并处理存储到excel(xls、xlsx)文件

    #-*- coding:gbk -*-
    import os
    import docx
    from win32com import client as wc
    import xlwt
    import xlsxwriter
    
    # 获取filepath文件夹下的所有的文件
    def getfilelist(filepath):
        """Recursively collect the paths of all files under ``filepath``.

        Args:
            filepath: Directory to scan.

        Returns:
            A list with the path of every non-directory entry found under
            ``filepath``. Sub-directories are descended into rather than
            listed. Entries appear in the order ``os.listdir`` reports them.
        """
        files = []
        for name in os.listdir(filepath):
            # Let os.path.join choose the separator instead of hard-coding one,
            # so the directory test below sees a real path on every platform.
            child = os.path.join(filepath, name)
            if os.path.isdir(child):
                files.extend(getfilelist(child))
            else:
                files.append(child)
        return files
    
    # 获取word文件文本
    def getDocx(fileName):
        """Return the text extracted from the Word document at ``fileName``.

        The file is opened with ``docx.opendocx`` and whatever
        ``docx.getdocumenttext`` produces for it is returned unchanged.
        """
        return docx.getdocumenttext(docx.opendocx(fileName))
    
    # 将doc转换为docx
    def doc2Docx(fileName):
        """Convert the Word document at ``fileName`` to ``.docx`` using Word.

        The converted copy is saved as ``fileName + "x"`` and the original
        file is deleted once the copy has been saved and the document closed.
        If opening or saving fails, the original file is left in place.

        Args:
            fileName: Path of the document to convert.
                NOTE(review): Word automation usually expects an absolute
                path -- confirm against callers.
        """
        word = wc.Dispatch("Word.Application")
        try:
            doc = word.Documents.Open(fileName)
            try:
                # 12 is Word's save-format code for .docx (wdFormatXMLDocument).
                doc.SaveAs(fileName + "x", 12, False, "", True, "", False, False, False, False)
            finally:
                # Always release the document, even when saving failed.
                doc.Close()
            # Reached only after a successful save and close.
            os.remove(fileName)
        finally:
            # Never leave a background Word process running.
            word.Quit()
    
    # Root folder to scan; a raw string keeps the backslashes literal.
    filepath = r"C:\xxx\xx\xx\xx\数据集"
    filelist = getfilelist(filepath)
    # If the folder holds .doc files, convert them to .docx first and re-scan:
    ##for doc_path in filelist:
    ##    if doc_path.lower().endswith(".doc"):
    ##        doc2Docx(doc_path)
    ##filelist = getfilelist(filepath)

    # Keep only the .docx documents (extension compared case-insensitively).
    docx_files = [name for name in filelist if name.lower().endswith(".docx")]

    # xlwt is not used for the output: with large documents it fails with
    # "Exception: String longer than 32767 characters", so the sheet is
    # written with xlsxwriter instead.
    workbook = xlsxwriter.Workbook(u'数据.xlsx')
    worksheet = workbook.add_worksheet(u"数据")
    for row, fileName in enumerate(docx_files):
        doc = getDocx(fileName)
        # One cell per path component, followed by the document text.
        filePaths = fileName.split(os.sep)
        # Every paragraph is terminated by a newline, including the last one.
        filePaths.append("".join(paragraph + "\n" for paragraph in doc))
        # At most 21 cells (columns 0-20) are written per row.
        for col, value in enumerate(filePaths[:21]):
            worksheet.write(row, col, value)
    workbook.close()
  • 相关阅读:
    【Linux设备驱动程序】Chapter 2
    【Linux设备驱动程序】Chapter 1
    sed 命令多行到多行的定位方式
    chmod 与大写 X
    C 语言中模板的几种实现方式
    /etc/default/grub 部分配置选项设置
    fcitx error
    QT5学习过程的小问题集锦
    Qt4编码
    Qt MainWindow结构
  • 原文地址:https://www.cnblogs.com/sixu/p/10104752.html
Copyright © 2011-2022 走看看