zoukankan      html  css  js  c++  java
  • python_读取 doc,docx,pdf

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import docx
    
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO
    
    from win32com import client
    import sys
    # Python 2 only: sys is reloaded first, then the interpreter-wide default
    # string encoding is switched to gb2312. This affects every implicit
    # str <-> unicode conversion in the process, not just this module.
    # NOTE(review): presumably needed so printing Chinese text does not raise
    # UnicodeEncodeError on a Chinese Windows console -- confirm before removing.
    reload(sys)
    sys.setdefaultencoding('gb2312')
    
    def readDocx(docxPath):
        """Return the text of a .docx file, one paragraph per line.

        Args:
            docxPath: Path of the .docx file; handed to ``docx.Document``.

        Returns:
            The text of every paragraph in document order, each stripped of
            leading/trailing whitespace, joined with newline characters.
        """
        doc = docx.Document(docxPath)
        # The separator has to be the escaped '\n': a literal line break inside
        # a single-quoted string is a syntax error.
        return '\n'.join(p.text.strip() for p in doc.paragraphs)
    def readPdf(pdfPath):
        """Extract the text of every page of a PDF with pdfminer.

        Args:
            pdfPath: Path of the PDF file; opened in binary mode.

        Returns:
            The text written by ``TextConverter`` (configured with the
            'utf-8' codec and default ``LAParams``) for all pages.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        try:
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # `with` closes the file even if a page fails to parse.
                with open(pdfPath, 'rb') as fp:
                    # Empty page set + maxpages=0 + empty password: same
                    # arguments the function has always passed to get_pages.
                    pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
            finally:
                device.close()
            # Read the buffer only after the device is closed, and before the
            # buffer itself is closed by the outer finally.
            return retstr.getvalue()
        finally:
            retstr.close()
    def readDoc(docPath):
        """Return the text of a legacy .doc file by round-tripping through Word.

        Word is driven over COM to save the document as a plain-text file in a
        private temporary directory; that file is then read back and removed.

        Args:
            docPath: Path of the .doc file to open in Word.

        Returns:
            The lines of the exported text, each stripped of leading/trailing
            whitespace, joined with newline characters.
        """
        # Function-scope imports keep this block self-contained.
        import io
        import os
        import shutil
        import tempfile

        # A fresh directory avoids clobbering (or needing write access to) a
        # fixed c:/temp.txt and lets concurrent calls coexist.
        tmpDir = tempfile.mkdtemp()
        txtPath = os.path.join(tmpDir, 'temp.txt')
        try:
            word = client.Dispatch('Word.Application')
            try:
                # NOTE(review): Word appears to resolve relative paths against
                # its own working directory, so pass an absolute path -- confirm.
                doc = word.Documents.Open(os.path.abspath(docPath))
                try:
                    # 2 is the save-format code for plain text
                    # (wdFormatText in Word's WdSaveFormat enumeration).
                    doc.SaveAs(txtPath, 2)
                finally:
                    doc.Close()
            finally:
                # Always quit, otherwise a hidden WINWORD.EXE is left running.
                word.Quit()
            # The export is decoded as gbk, exactly as before.
            # NOTE(review): assumes Word wrote the file in the gbk/cp936 ANSI
            # code page -- verify on non-Chinese Windows installs.
            with io.open(txtPath, 'r', encoding='gbk') as f:
                return '\n'.join(line.strip() for line in f)
        finally:
            shutil.rmtree(tmpDir, ignore_errors=True)
    if __name__ == '__main__':
        # Demo entry point: extract the text of a legacy .doc file and print it.
        # The other readers are used the same way, e.g.
        #   readDocx('d:/1.docx')  or  readPdf('d:/3.pdf')
        print(readDoc('d:/2.doc'))
  • 相关阅读:
    超有爱的并查集
    写给想当程序员的朋友
    POJ 1961 字符串 KMP (i-next[i])
    POJ 2406 KMP算法next数组理解
    POJ 2387 Bellman双重边
    POJ 1917 字符串替换
    POJ 1062 坑爹的聘礼(枚举等级差选择性找边)
    Linux下libxml2的使用
    浙大pat 1003
    判定一棵二叉树是否是二叉搜索树
  • 原文地址:https://www.cnblogs.com/zy900406/p/6654017.html
Copyright © 2011-2022 走看看