zoukankan      html  css  js  c++  java
  • python_读取 doc,docx,pdf

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import docx
    
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO
    
    from win32com import client
    import sys
    # Python 2 only: reload(sys) makes sys.setdefaultencoding available again
    # (the attribute is removed from the sys module at interpreter start-up);
    # the call then switches the process-wide default codec used for implicit
    # str <-> unicode conversions to gb2312.
    reload(sys)
    sys.setdefaultencoding('gb2312')
    
    def readDocx(docxPath):
        """Return the text of a .docx file, one stripped paragraph per line.

        docxPath: path of the .docx document; it is passed to docx.Document.

        Returns the text of every paragraph, each stripped of surrounding
        whitespace, joined with newline characters.
        """
        doc = docx.Document(docxPath)
        # The separator has to be the escaped '\n'; a literal line break inside
        # the quotes is an unterminated string and the module will not parse.
        return '\n'.join(p.text.strip() for p in doc.paragraphs)
    def readPdf(pdfPath):
        """Extract the text of a PDF file with pdfminer.

        pdfPath: path of the PDF file; it is opened in binary mode.

        Returns the contents of the in-memory buffer that the TextConverter
        (configured with the 'utf-8' codec and default LAParams) wrote to
        while every page was processed.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        try:
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # open() replaces the Python-2-only file() builtin, and the
                # with-block closes the handle even if a page fails to parse.
                with open(pdfPath, 'rb') as fp:
                    # Empty page-number set and maxpages=0: no page filter is
                    # requested (same arguments as before).
                    pages = PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
            finally:
                device.close()
            # Read the buffer only after the device is closed, and before the
            # buffer itself is released in the outer finally.
            return retstr.getvalue()
        finally:
            retstr.close()
    def readDoc(docPath, tmpPath='c:/temp.txt'):
        """Return the text of a legacy .doc file by exporting it through Word.

        Word is driven through COM (win32com): the document is opened, saved
        as a text file at tmpPath, and that file is then read back as GBK text.

        docPath: path of the .doc file handed to Word's Documents.Open.
        tmpPath: where the intermediate text export is written. The default is
            the location this function has always used; the file is left in
            place afterwards, as before.

        Returns the exported lines, each stripped of surrounding whitespace,
        joined with newline characters.
        """
        import io  # local import: io.open decodes on Python 2 and 3 alike

        word = client.Dispatch('Word.Application')
        try:
            # Open an existing document.
            doc = word.Documents.Open(docPath)
            try:
                # 2 is the FileFormat argument of SaveAs (plain-text export).
                doc.SaveAs(tmpPath, 2)
            finally:
                # Close the document even if the export failed.
                doc.Close()
        finally:
            # Always shut Word down so no orphaned process is left running.
            word.Quit()

        with io.open(tmpPath, 'r', encoding='gbk') as f:
            fullText = [line.strip() for line in f]
        return '\n'.join(fullText)
    if __name__ == '__main__':
        # Demo entry point: extract and print the text of a sample .doc file.
        # The other two readers are used the same way, e.g.:
        #     print(readDocx('d:/1.docx'))
        #     print(readPdf('d:/3.pdf'))
        docValue = readDoc('d:/2.doc')
        # The parenthesised form prints the same on Python 2 and is also valid
        # Python 3 syntax, unlike the bare print statement.
        print(docValue)
  • 相关阅读:
    两个单链表的第一个公共节点
    对于混沌定义中三个条件的理解
    sort()函数使用详解
    C++优先队列详解
    第一次只出现一个的字符
    丑数
    把数组排成最小的数
    从1到n整数中1的个数
    git相关知识
    文件处理(如果文件存在则追加,不存在则生成多级文件夹以及txt目录)
  • 原文地址:https://www.cnblogs.com/zy900406/p/6654017.html
Copyright © 2011-2022 走看看