zoukankan      html  css  js  c++  java
  • pdfminer批量处理PDF文件

    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTTextLineHorizontal, LTFigure, LTRect, LTLine, LTCurve
    import os
    
    
    class PdfForString(object):
        """Batch-extract text from a folder of PDF files with pdfminer.

        A single resource manager, layout-aggregating device and page
        interpreter are created once and reused for every document.
        """

        def __init__(self):
            # Names of all entries in the PDF folder.
            # NOTE(review): this literal has no path separators (backslashes may
            # have been lost when the snippet was published), and it is a
            # different location from the one for_string() builds paths from.
            # Confirm both refer to the same directory.
            self.pdf_list = os.listdir(r'E:StockExchangePDF')
            # Stores resources shared across documents.
            self.src = PDFResourceManager()
            # Device object that aggregates each page into a layout tree.
            self.device = PDFPageAggregator(self.src, laparams=LAParams())
            # Interpreter that renders pages onto the device.
            self.inter = PDFPageInterpreter(self.src, self.device)

        def for_string(self):
            """Yield a path for every name in ``self.pdf_list``.

            Each path is the name joined onto the ``PDF`` directory located
            two levels above this source file.
            """
            # Loop-invariant, so compute the directory once.
            pdf_dir = os.path.dirname(os.path.dirname(__file__)) + '/PDF'
            for pdf in self.pdf_list:
                yield os.path.join(pdf_dir, pdf)

        def pdf_analysis(self):
            """Yield the page iterator of each PDF produced by ``for_string``.

            The underlying file stays open while the caller consumes the
            yielded pages and is closed when the generator is advanced to the
            next document (or closed), so pages must be consumed before
            requesting the next item.
            """
            for path in self.for_string():
                # ``with`` guarantees the handle is released; the original
                # leaked one open file per processed PDF.
                with open(path, 'rb') as pd_file:
                    parser = PDFParser(pd_file)  # parser bound to the file

                    # Document object; parser and document reference each other.
                    document = PDFDocument()
                    parser.set_document(document)
                    document.set_parser(parser)
                    # Required by the pdfminer API after set_parser(): sets up
                    # decryption (empty password) and permission flags.
                    document.initialize()
                    yield document.get_pages()

        def get_string(self):
            """Print the text of every horizontal text box of every page."""
            for pages in self.pdf_analysis():
                for page in pages:
                    self.inter.process_page(page)
                    layout = self.device.get_result()
                    for element in layout:
                        if isinstance(element, LTTextBoxHorizontal):
                            print(element.get_text())
    
    
    # Script entry point: build the extractor and print the text of every PDF it finds.
    PdfForString().get_string()
    

      

  • 相关阅读:
    phpstudy apache无法启动的解决办法
    Windows server 2008 快速搭建域环境
    使用netsh来进行端口转发
    基于docker构建测试环境
    [Leetcode] Linked List Cycle
    Hyper-V下安装Ossim系统
    Windows应用替代方案接龙
    图解CISCO 3550忘记密码解决方法
    硬件代理解决用户上网问题
    图->存储结构->数组表示法(邻接矩阵)
  • 原文地址:https://www.cnblogs.com/wangtaobiu/p/11947133.html
Copyright © 2011-2022 走看看