zoukankan      html  css  js  c++  java
  • python_读取 doc,docx,pdf

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import docx
    
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO
    
    from win32com import client
    import sys
    # Python 2-only workaround: sys.setdefaultencoding is removed from the sys
    # namespace during interpreter startup, so the module is reloaded to get
    # it back. The call then makes gb2312 the process-wide codec used for
    # implicit str <-> unicode conversions.
    reload(sys)
    sys.setdefaultencoding('gb2312')
    
    def readDocx(docxPath):
        """Return the text of a .docx file, one paragraph per line.

        Args:
            docxPath: Path of the .docx document, passed to docx.Document.

        Returns:
            The text of every paragraph in doc.paragraphs, each stripped of
            leading/trailing whitespace and joined with a newline character.
            Empty paragraphs are kept as empty lines.
        """
        doc = docx.Document(docxPath)
        # The separator must be the escape sequence '\n'; a literal line break
        # inside the quotes is an unterminated string literal.
        return '\n'.join(p.text.strip() for p in doc.paragraphs)
    def readPdf(pdfPath):
        """Extract the text of every page of a PDF file with pdfminer.

        Args:
            pdfPath: Path of the PDF file; it is opened in binary mode.

        Returns:
            The contents of the output buffer after all pages were processed
            by a TextConverter configured with codec 'utf-8' and default
            LAParams.

        Raises:
            IOError: If pdfPath cannot be opened.
            Whatever pdfminer raises for a document it cannot process (the
            pages are requested with check_extractable=True and an empty
            password).
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        try:
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # open() instead of the Python 2-only file() builtin; the
                # with-block closes the handle even if a page fails.
                with open(pdfPath, 'rb') as fp:
                    # An empty page-number set and maxpages=0 are passed
                    # through unchanged from the original call.
                    pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
            finally:
                device.close()
            # Read the buffer only after the device is closed, as before.
            return retstr.getvalue()
        finally:
            retstr.close()
    def readDoc(docPath):
        """Return the text of a legacy .doc file by driving Word through COM.

        The document is opened in Word, saved as a text file (SaveAs with
        file-format code 2) into a private temporary directory, and that file
        is read back and decoded as gbk.

        Args:
            docPath: Path of the .doc file, passed to Word's Documents.Open.

        Returns:
            The lines of the exported text file, each stripped of
            leading/trailing whitespace and joined with a newline character.

        Raises:
            Whatever win32com raises if Word cannot be started or the
            document cannot be opened or saved.
            UnicodeDecodeError: If the exported text is not valid gbk.
        """
        # Function-scope imports keep this block self-contained.
        import io
        import os
        import shutil
        import tempfile

        # A private temp directory replaces the fixed 'c:/temp.txt', which
        # needed write access to the drive root, overwrote any existing file
        # and was never removed.
        tmpDir = tempfile.mkdtemp()
        txtPath = os.path.join(tmpDir, 'temp.txt')
        try:
            word = client.Dispatch('Word.Application')
            try:
                # Open an existing document.
                doc = word.Documents.Open(docPath)
                try:
                    doc.SaveAs(txtPath, 2)
                finally:
                    # Close the document even if the export failed.
                    doc.Close()
            finally:
                # Always quit Word so no background process is left behind.
                word.Quit()
            # NOTE(review): gbk is the encoding the original code assumed for
            # Word's text export; confirm on non-Chinese Windows locales.
            with io.open(txtPath, 'r', encoding='gbk') as f:
                return '\n'.join(line.strip() for line in f)
        finally:
            shutil.rmtree(tmpDir, ignore_errors=True)
    if __name__ == '__main__':
        # Manual smoke test: convert one .doc file and print its text.
        # The other readers can be tried the same way:
        #   print(readDocx('d:/1.docx'))
        #   print(readPdf('d:/3.pdf'))
        print(readDoc('d:/2.doc'))
  • 相关阅读:
    vue-cli和webpack中本地静态图片的路径问题解决方案
    vue-cli 中的配置分析
    webpack中manifest.js vendor.js app.js 三者的区别
    css 中可以继承的属性
    有关正则表达式的Js方法(replace)
    css 常用的几种垂直居中(包括图片)
    如何在Vue中建立全局引用或者全局命令
    删除数组中多个不连续的数组元素的正确姿势
    数据库
    代码片段
  • 原文地址:https://www.cnblogs.com/zy900406/p/6654017.html
Copyright © 2011-2022 走看看