zoukankan      html  css  js  c++  java
  • 使用PyPDF2结合pdfminer拆分PDF,并提取关键字重命名拆分出来的文件

    需求:银行汇款回单PDF几十页,每一页包含两个回单。需把每一张回单拆分出来,并且以回单上交易附言处TPPXXXXXXXX格式的流水号重命名拆出来的文件。

    思路:

    1.使用PyPDF2把每一页一分为二,输出PDF到一个目录A。

    2.循环目录A,使用pdfminer提取TPPXXXXXXXX格式的流水号,重命名PDF文件。

    3.使用pyinstaller -F 打包成一个exe文件。(注意:要在C盘打包)CMD: C:\Users\<用户名>\PDF>pyinstaller -F C:\Users\chende\PDF\pdftools.py

    # -*- coding: UTF-8 -*-  
    from PyPDF2 import PdfFileReader, PdfFileWriter
    
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage,PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.layout import LAParams, LTTextBox
    from pdfminer.converter import PDFPageAggregator
    
    import re
    import os
    import os.path
    
    #切割PDF
    def split_pdf(infile, out_path):
        """Split every page of a PDF into a top half and a bottom half.

        Each page ``n`` (1-based) produces two single-page files inside
        ``out_path``: ``<n>_top.pdf`` and ``<n>_bottom.pdf``. The halves are
        made by shrinking the page's media box, not by re-rendering content.

        NOTE: the crop uses absolute coordinates ``0 .. width`` and
        ``0 .. height``, i.e. it assumes the media box starts at (0, 0).

        :param infile: path of the PDF file to split
        :param out_path: directory that receives the half-page PDFs; it is
            created when missing
        :return: None
        """
        os.makedirs(out_path, exist_ok=True)

        # A distinct name for the handle keeps the ``infile`` path intact.
        with open(infile, 'rb') as pdf_stream:
            # This reader is only used for measuring; it is never mutated.
            measuring_reader = PdfFileReader(pdf_stream)

            for i in range(measuring_reader.getNumPages()):
                media_box = measuring_reader.getPage(i).mediaBox
                width = float(media_box.getWidth())
                height = float(media_box.getHeight())
                middle = height / 2

                _write_cropped_page(
                    pdf_stream, i, (0, middle), (width, height),
                    os.path.join(out_path, str(i + 1) + '_top.pdf'))
                _write_cropped_page(
                    pdf_stream, i, (0, 0), (width, middle),
                    os.path.join(out_path, str(i + 1) + '_bottom.pdf'))


    def _write_cropped_page(pdf_stream, page_index, lower_left, upper_right,
                            out_file_name):
        """Write one page, cropped to the given rectangle, as its own PDF."""
        # The original author noted that the file must be re-read for every
        # crop or an error occurs, so each crop gets a fresh reader.
        page = PdfFileReader(pdf_stream).getPage(page_index)
        # The two opposite corners fully determine the rectangle.
        page.mediaBox.lowerLeft = lower_left
        page.mediaBox.upperRight = upper_right

        writer = PdfFileWriter()
        writer.addPage(page)
        with open(out_file_name, 'wb') as outfile:
            writer.write(outfile)
    
    #重命名PDF
    def extractPDF(out_Path):
        """Rename each PDF under ``out_Path`` after the serial number in its text.

        The directory tree is walked with ``os.walk``. For every ``*.pdf``
        file, the text boxes are searched for a serial number of the form
        ``TPP`` followed by eight digits; the first one found (in document
        order) becomes the new file name, ``<serial>.pdf``, in the same
        directory. Files that are not PDFs, or that contain no serial number,
        are left untouched. An existing file is never overwritten.

        :param out_Path: directory containing the PDFs to rename
        :return: None
        :raises PDFTextExtractionNotAllowed: if a PDF forbids text extraction
        """
        serial_re = re.compile(r'TPP\d{8}')

        for parent, dirnames, filenames in os.walk(out_Path):
            for filename in filenames:
                # Anything that is not a PDF would make the parser fail.
                if not filename.lower().endswith('.pdf'):
                    continue

                # Join with ``parent`` so nested directories and a missing
                # trailing separator on ``out_Path`` both work.
                old_name = os.path.join(parent, filename)
                serial = _find_serial(old_name, serial_re)
                if serial is None:
                    continue
                print(serial)

                new_name = os.path.join(parent, serial + ".pdf")
                if new_name == old_name:
                    continue  # already carries the right name
                if os.path.exists(new_name):
                    # Two receipts with the same serial: keep both files
                    # rather than silently replacing one of them.
                    print('skipped ' + old_name + ': ' + new_name + ' already exists')
                    continue
                os.rename(old_name, new_name)


    def _find_serial(pdf_path, serial_re):
        """Return the first ``serial_re`` match in the PDF's text boxes, or None."""
        with open(pdf_path, 'rb') as fp:
            document = PDFDocument(PDFParser(fp))
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    # The aggregator hands back the layout of the page that
                    # was just processed.
                    for element in device.get_result():
                        if not isinstance(element, LTTextBox):
                            continue
                        found = serial_re.search(element.get_text())
                        if found:
                            return found.group()
            finally:
                # Released even when parsing raises, as is the file handle.
                device.close()
        return None
    
    
    if __name__ == '__main__':
        # Script entry point: cut the source PDF into half-page files inside
        # the output folder, then walk that same folder to rename them.
        in_File, out_Path = './PDFfile.pdf', './Single/'
        split_pdf(in_File, out_Path)
        extractPDF(out_Path)
        
  • 相关阅读:
    Pandas数据分析 (三)
    Pandas数据分析 (二)
    Pandas数据分析 (一)
    Django后台应用管理名称修改
    Jupyter Notebooks 配置
    Hadoop完全分布式搭建流程
    Java学习笔记(五)
    微信小程序学习笔记(一)
    redis windows版安装
    Linux计划任务
  • 原文地址:https://www.cnblogs.com/mysick/p/12709179.html
Copyright © 2011-2022 走看看