zoukankan      html  css  js  c++  java
  • python——提取pdf(将pdf文件转成txt)

    # -*- coding: utf-8 -*-   
    # from pdfminer.pdfparser import PDFParser
    # from pdfminer.pdfdocument import PDFDocument
    # from pdfminer.pdfpage import PDFPage
    # from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    # from pdfminer.pdfinterp import PDFResourceManager
    # from pdfminer.pdfinterp import PDFPageInterpreter
    # from pdfminer.pdfdevice import PDFDevice
    # from pdfminer.layout import *
    # from pdfminer.converter import PDFPageAggregator
    # import os
    # # os.chdir(r'F:\test')
    # fp = open('s.pdf', 'rb')
    # #来创建一个pdf文档分析器
    # parser = PDFParser(fp)  
    # #创建一个PDF文档对象存储文档结构
    # document = PDFDocument(parser)
    # # 检查文件是否允许文本提取
    # if not document.is_extractable:
    #     raise PDFTextExtractionNotAllowed
    # else:
    #     # 创建一个PDF资源管理器对象来存储共享资源
    #     rsrcmgr=PDFResourceManager()
    #     # 设定参数进行分析
    #     laparams=LAParams()
    #     # 创建一个PDF设备对象
    #     # device=PDFDevice(rsrcmgr)
    #     device=PDFPageAggregator(rsrcmgr,laparams=laparams)
    #     # 创建一个PDF解释器对象
    #     interpreter=PDFPageInterpreter(rsrcmgr,device)
    #     # 处理每一页
    #     for page in PDFPage.create_pages(document):
    #         interpreter.process_page(page)
    #         # 接受该页面的LTPage对象
    #         layout=device.get_result()
    #         for x in layout:
    #             if(isinstance(x,LTTextBoxHorizontal)):
    #                 with open('h.txt','w') as f:
    #                     f.write(str(x.get_text().encode('utf-8'))+"\n")
    # print("process done")
    
    #_*_coding:utf-8_*_
     
    from pdfminer.pdfparser import  PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LTTextBoxHorizontal,LAParams
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
     
     
    def parse(Path, Save_name, encoding=None):
        """Extract horizontal text boxes from a PDF and append them to a text file.

        Args:
            Path: The PDF opened as a binary file object (despite its name this
                is not a filesystem path); it is passed straight to ``PDFParser``.
            Save_name: Name of the output text file. It is opened in append
                mode, so any existing content is kept.
            encoding: Optional text encoding for the output file. ``None``
                (the default) keeps the platform default used by ``open``.

        Raises:
            PDFTextExtractionNotAllowed: If the document reports that it is
                not extractable.
        """
        parser = PDFParser(Path)
        document = PDFDocument(parser)

        # Guard clause: refuse documents that do not permit text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once for the whole document instead of re-opening it
        # for every text box; str() mirrors the original '%s' formatting of
        # the file name.
        with open(str(Save_name), 'a', encoding=encoding) as f:
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # The aggregator device holds the layout of the page just processed.
                layout = device.get_result()
                for x in layout:
                    # Only horizontal text boxes are written; other layout
                    # objects are skipped.
                    if isinstance(x, LTTextBoxHorizontal):
                        f.write(x.get_text())
     
    if __name__ == '__main__':
        # Convert s.pdf to 1.txt. The context manager closes the PDF handle
        # even if parse() raises.
        with open('s.pdf', 'rb') as Path:
            parse(Path, '1.txt')
    import re
    # Scan all.txt: each line matching the header pattern is written (without
    # its trailing newline) followed by " : ", then the next line matching the
    # value pattern is written after it. Results are appended to the output file.
    #
    # NOTE(review): both patterns look like they lost backslashes during
    # extraction (possibly r'S\d_\d\d\d' and r'^[0-9]\d\d$') - confirm against
    # the real input before relying on them. They are kept as found.
    header_pattern = re.compile(r'Sd_ddd')
    value_pattern = re.compile("^[0-9]dd$")

    # Context managers close both files even if an error occurs mid-scan.
    with open("all.txt") as file, open("提取出来的值2.txt", 'a') as fw:
        get_lens = "no"  # "yes" while a header is waiting for its value line
        for line in file:
            if header_pattern.search(line):
                # Drop the newline so the value lands on the same output line.
                line = line.rstrip("\n")
                fw.write(line + " : ")
                get_lens = "yes"
            if get_lens == "yes" and value_pattern.search(line):
                # The value line keeps its newline, which ends the output record.
                fw.write(line)
                get_lens = "no"
        
  • 相关阅读:
    那些年,学swift踩过的坑
    JAVA经BigDecimal圆角的解决方案及注意事项
    Jquery简介选择的
    Oracle性能优化顺序表名称来选择最有效的学习笔记
    jQuery Validate插入 reomte使用详细的说明
    公钥私人 ssh避password登陆
    Telnet,SSH1,SSH2,Telnet/SSL,Rlogin,Serial,TAPI,RAW
    mysqlbackup 还原特定的表
    OpenJDK 阅读源代码 Java 实现字节流输入类
    Android Application Thread CPU GC Operatiing and OOM Question 0603-随手笔记
  • 原文地址:https://www.cnblogs.com/shunguo/p/14533230.html
Copyright © 2011-2022 走看看