zoukankan      html  css  js  c++  java
  • python_读取 doc,docx,pdf

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    import docx
    
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO
    
    from win32com import client
    import sys
    # Python 2 only: re-import sys and switch the interpreter-wide default
    # string encoding to gb2312. Presumably the reload is needed because
    # setdefaultencoding is not reachable on the initially imported module.
    # NOTE(review): this affects every implicit str/unicode conversion in the
    # process, not just this script - confirm it is still required.
    reload(sys)
    sys.setdefaultencoding('gb2312')
    
    def readDocx(docxPath):
        """Return the text of a .docx file, one paragraph per line.

        docxPath -- path handed to docx.Document.

        Each paragraph's text is stripped of leading/trailing whitespace and
        the results are joined with a newline character.
        """
        doc = docx.Document(docxPath)
        return '\n'.join([para.text.strip() for para in doc.paragraphs])
    def readPdf(pdfPath):
        """Return the text extracted from a PDF file by pdfminer.

        pdfPath -- path of the PDF file; it is opened in binary mode.

        Pages are fed through a TextConverter configured with the 'utf-8'
        codec and default LAParams, and the accumulated buffer content is
        returned. The file, the converter device and the buffer are always
        closed, even when a page fails to process.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        try:
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                with open(pdfPath, 'rb') as fp:
                    # Empty page-number set, maxpages=0 and an empty password
                    # are the values the original call used.
                    # NOTE(review): presumably this means "all pages" - confirm
                    # against the installed pdfminer version.
                    pages = PDFPage.get_pages(
                        fp, set(), maxpages=0, password="",
                        caching=True, check_extractable=True)
                    for page in pages:
                        interpreter.process_page(page)
            finally:
                # Close the device before reading the buffer, as the original did.
                device.close()
            return retstr.getvalue()
        finally:
            retstr.close()
    def readDoc(docPath, tempPath='c:/temp.txt'):
        """Return the text of a legacy .doc file using Word via COM automation.

        docPath  -- path of the .doc file passed to Word's Documents.Open.
        tempPath -- file Word saves the exported text to before it is read
                    back; defaults to the path the original script hard-coded.

        The document is saved with file format 2 (wdFormatText in Word's
        WdSaveFormat enumeration), then every line of that file is decoded as
        GBK, stripped, and joined with a newline character. The document and
        the Word instance are released even if opening or saving fails.
        NOTE(review): the temporary file is left on disk, as in the original.
        """
        word = client.Dispatch('Word.Application')
        try:
            # Open the existing document.
            doc = word.Documents.Open(docPath)
            try:
                doc.SaveAs(tempPath, 2)
            finally:
                # Close the document.
                doc.Close()
        finally:
            word.Quit()
        with open(tempPath, 'r') as f:
            return '\n'.join([line.decode('gbk').strip() for line in f])
    if __name__ == '__main__':
        # Manual smoke test: extract a sample .doc and print its text.
        # The other readers can be tried the same way, e.g.
        #   readDocx('d:/1.docx')
        #   readPdf('d:/3.pdf')
        print(readDoc('d:/2.doc'))
  • 相关阅读:
    CentOS 7拨号上网(ADSL & PPPoE)
    linux使用nmcli重新生成网卡配置文件
    Linux 内存缓存占用过大,Centos7设置定时清除buff/cache的脚本
    部署redis6.0 常见问题
    ssh 升级导致的hadoop 主备切换失败
    配置zookeeper的 ACL权限
    sqoop 创建跟mysql相同表结构的hive表报错
    vim中显示不可见字符
    supervisor 使用
    使用hive streaming 统计Wordcount
  • 原文地址:https://www.cnblogs.com/zy900406/p/6654017.html
Copyright © 2011-2022 走看看