zoukankan      html  css  js  c++  java
  • python自动化之PDF

    ###################################处理PDF和Word文档###################################

    '''

    PDF和Word文档是二进制文件,除了文本之外,

    它们还保存了许多字体、颜色和布局信息

    '''

    '''

    从PDF提取文本

    '''

    ###################################从PDF提取文本###################################

    import PyPDF2

    # Extract the text of the first page of a PDF.
    #
    # NOTE(review): the path separators were missing from this literal
    # ('C:UsersAdministratorDesktop est.pdf'); they are restored here on the
    # assumption that the original pointed at Desktop\test.pdf -- confirm.
    #
    # PDFs are binary, hence mode 'rb'.  The ``with`` block closes the handle
    # even when parsing fails; previously it was never closed.
    with open(r'C:\Users\Administrator\Desktop\test.pdf', 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

        # Number of pages in the document.  Printed explicitly because a bare
        # expression only shows a value in the interactive interpreter.
        print(pdfReader.numPages)

        # Page indexes are zero-based: 0 is the first page.
        pageObj = pdfReader.getPage(0)

        # Extract while the file is still open -- presumably PyPDF2 reads page
        # content from the stream on demand (TODO confirm).
        print(pageObj.extractText())

    ###################################解密PDF#########################################

    import PyPDF2

    # Open a password-protected PDF and read its first page.
    #
    # NOTE(review): the path separators were missing from this literal
    # ('C:UsersAdministratorDesktop est.pdf'); they are restored here on the
    # assumption that the original pointed at Desktop\test.pdf -- confirm.
    with open(r'C:\Users\Administrator\Desktop\test.pdf', 'rb') as encryptedFile:
        pdfReader = PyPDF2.PdfFileReader(encryptedFile)

        # isEncrypted tells whether the document is password protected.
        # Pages must not be requested before decrypt() has succeeded, so the
        # early getPage(0) call that used to precede decrypt() is gone.
        if pdfReader.isEncrypted:
            # Supply the password; decrypt() returns 0 when it does not match.
            if pdfReader.decrypt('rosebud') == 0:
                raise ValueError('wrong password for encrypted PDF')

        # Safe now: the reader is either unencrypted or unlocked.
        pageObj = pdfReader.getPage(0)

    ###################################创建PDF#########################################

    '''

    PyPDF2不能将任意文本写入PDF:

    PyPDF2写入PDF的能力,仅限于从其他PDF中拷贝页面、旋转页面、重叠页面和加密文件

    '''

    '''

    一般方式:

    1、打开一个或多个已有的PDF(源PDF),得到PdfFileReader对象

    2、创建一个新的PdfFileWriter对象

    3、将页面从PdfFileReader对象拷贝到PdfFileWriter对象中

    4、利用PdfFileWriter对象写入输出的PDF

    '''

    #####################################################拷贝页面###########################################################

    def merge(pdf_one, pdf_two, filename='my.pdf',output_dir=r'C:\Users\Administrator\Desktop'):
        """Concatenate two PDF files into one new PDF.

        Every page of ``pdf_one`` is copied first, followed by every page of
        ``pdf_two``; the result is written to ``filename`` inside
        ``output_dir``.  Progress markers, the page counts and the output path
        are printed to stdout.

        Args:
            pdf_one: Path of the PDF whose pages come first.
            pdf_two: Path of the PDF whose pages are appended.
            filename: Name of the output file.
            output_dir: Directory that receives the output file.
                NOTE(review): the default had lost its backslashes
                ('C:UsersAdministratorDesktop'); restored on the assumption
                that the user's Desktop folder was meant -- confirm.

        Raises:
            IOError/OSError: If an input cannot be opened or the output
                cannot be created.
        """
        # Local import: ``os`` is not imported by the surrounding file.
        import os

        # Join with a real path separator; plain ``output_dir + filename``
        # glued the file name onto the last directory component.
        pdf_name = os.path.join(output_dir, filename)

        # open() instead of the Python-2-only file(); ``with`` closes every
        # handle even if reading or writing raises.
        with open(pdf_one, 'rb') as input_one:
            with open(pdf_two, 'rb') as input_two:
                pdf_input_one = PyPDF2.PdfFileReader(input_one)
                pdf_input_two = PyPDF2.PdfFileReader(input_two)

                # Single-argument print(...) behaves the same on Python 2 and 3.
                print('%s %s' % (pdf_input_one.getNumPages(),
                                 pdf_input_two.getNumPages()))

                pdf_output = PyPDF2.PdfFileWriter()

                # Copy the pages of each source in order; the marker is the
                # per-page progress message for that source.
                for reader, marker in ((pdf_input_one, 'hereo'),
                                       (pdf_input_two, 'heret')):
                    for pageNum in range(reader.getNumPages()):
                        print(marker)
                        pdf_output.addPage(reader.getPage(pageNum))

                print(pdf_name)

                # Write while both inputs are still open: the writer holds
                # page objects that belong to the readers.
                with open(pdf_name, 'wb') as output_stream:
                    pdf_output.write(output_stream)

        print('Done!')

    # Merge the two sample PDFs from the Desktop.
    # NOTE(review): both literals had lost their backslashes
    # ('C:UsersAdministratorDesktop...'); restored on the assumption that the
    # files live directly in the user's Desktop folder -- confirm.
    merge(r'C:\Users\Administrator\Desktop\Pairs_Trading_Quantitative Methods and Analysis.pdf',
          r'C:\Users\Administrator\Desktop\deMontjoye.SM.pdf')

    #####################################################旋转页面###########################################################

    '''

    利用rotateClockwise()和rotateCounterClockwise()方法

    PDF文档的页面也可以旋转90度的整数倍,向这些方法传入

    整数90、180或270

    '''

    def merge(pdf_one, pdf_two, filename='my.pdf',output_dir=r'C:\Users\Administrator\Desktop'):
        """Concatenate two PDF files, rotating every page 90 degrees clockwise.

        Every page of ``pdf_one`` is rotated and copied first, followed by
        every rotated page of ``pdf_two``; the result is written to
        ``filename`` inside ``output_dir``.  Progress markers, the page counts
        and the output path are printed to stdout.

        Args:
            pdf_one: Path of the PDF whose pages come first.
            pdf_two: Path of the PDF whose pages are appended.
            filename: Name of the output file.
            output_dir: Directory that receives the output file.
                NOTE(review): the default had lost its backslashes
                ('C:UsersAdministratorDesktop'); restored on the assumption
                that the user's Desktop folder was meant -- confirm.

        Raises:
            IOError/OSError: If an input cannot be opened or the output
                cannot be created.
        """
        # Local import: ``os`` is not imported by the surrounding file.
        import os

        # Join with a real path separator; plain ``output_dir + filename``
        # glued the file name onto the last directory component.
        pdf_name = os.path.join(output_dir, filename)

        # open() instead of the Python-2-only file(); ``with`` closes every
        # handle even if reading or writing raises.
        with open(pdf_one, 'rb') as input_one:
            with open(pdf_two, 'rb') as input_two:
                pdf_input_one = PyPDF2.PdfFileReader(input_one)
                pdf_input_two = PyPDF2.PdfFileReader(input_two)

                # Single-argument print(...) behaves the same on Python 2 and 3.
                print('%s %s' % (pdf_input_one.getNumPages(),
                                 pdf_input_two.getNumPages()))

                pdf_output = PyPDF2.PdfFileWriter()

                # Rotate and copy the pages of each source in order; the
                # marker is the per-page progress message for that source.
                for reader, marker in ((pdf_input_one, 'hereo'),
                                       (pdf_input_two, 'heret')):
                    for pageNum in range(reader.getNumPages()):
                        print(marker)
                        pageObj = reader.getPage(pageNum)
                        # The value returned by rotateClockwise() is what gets
                        # added, as before.
                        pageObj = pageObj.rotateClockwise(90)
                        pdf_output.addPage(pageObj)

                print(pdf_name)

                # Write while both inputs are still open: the writer holds
                # page objects that belong to the readers.
                with open(pdf_name, 'wb') as output_stream:
                    pdf_output.write(output_stream)

        print('Done!')

    # Merge the two sample PDFs from the Desktop with the rotating merge().
    # NOTE(review): both literals had lost their backslashes
    # ('C:UsersAdministratorDesktop...'); restored on the assumption that the
    # files live directly in the user's Desktop folder -- confirm.
    merge(r'C:\Users\Administrator\Desktop\Pairs_Trading_Quantitative Methods and Analysis.pdf',
          r'C:\Users\Administrator\Desktop\deMontjoye.SM.pdf')

    #####################################################叠加页面###########################################################

    import PyPDF2

    # Overlay the first page of one PDF (the "watermark") onto the first page
    # of another, keep the remaining pages unchanged, and save the result.
    #
    # NOTE(review): all three path literals had lost their backslashes
    # ('C:UsersAdministratorDesktop...'); restored on the assumption that the
    # files live directly in the user's Desktop folder -- confirm.
    #
    # Nested ``with`` blocks close every handle, including the watermark file
    # that used to be opened inline and never closed, even when an error
    # occurs part-way through.
    with open(r'C:\Users\Administrator\Desktop\Pairs_Trading_Quantitative Methods and Analysis.pdf', 'rb') as minutesFile:
        with open(r'C:\Users\Administrator\Desktop\deMontjoye.SM.pdf', 'rb') as watermarkFile:
            pdfReader = PyPDF2.PdfFileReader(minutesFile)
            minutesFirstPage = pdfReader.getPage(0)

            # Merge the watermark's first page into the document's first page.
            pdfWatermarkReader = PyPDF2.PdfFileReader(watermarkFile)
            minutesFirstPage.mergePage(pdfWatermarkReader.getPage(0))

            pdfWriter = PyPDF2.PdfFileWriter()
            pdfWriter.addPage(minutesFirstPage)

            # Pages 1..end are copied as they are; page 0 was handled above.
            for pageNum in range(1, pdfReader.numPages):
                pdfWriter.addPage(pdfReader.getPage(pageNum))

            # Write while both sources are still open: the writer holds page
            # objects that belong to the readers.
            with open(r'C:\Users\Administrator\Desktop\merge.pdf', 'wb') as resultPdfFile:
                pdfWriter.write(resultPdfFile)

    #####################################################加密PDF###########################################################

    import PyPDF2

    # Copy every page of a PDF into a new, password-protected PDF.
    #
    # NOTE(review): the path literals had lost their backslashes.  The input
    # path is restored on the assumption that the file lives in the user's
    # Desktop folder.  The output literal read 'C:UsersAdministratorDesktop .pdf';
    # the name below presumes the blank was an eaten "\t" -- confirm the
    # intended output file name before relying on it.
    #
    # open() replaces the Python-2-only file(); ``with`` closes both handles
    # (the input was never closed before), even if an error occurs.
    with open(r'C:\Users\Administrator\Desktop\deMontjoye.SM.pdf', 'rb') as pdfFile:
        pdfReader = PyPDF2.PdfFileReader(pdfFile)
        pdfWriter = PyPDF2.PdfFileWriter()

        for pageNum in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(pageNum))

        # Encryption is configured on the writer before the file is written.
        pdfWriter.encrypt('swordfish')

        # Write while the input is still open: the writer holds page objects
        # that belong to the reader.
        with open(r'C:\Users\Administrator\Desktop\t.pdf', 'wb') as resultPdf:
            pdfWriter.write(resultPdf)

  • 相关阅读:
    Fix Installing .NET Framework 3.5 failed Error Code 0x800F0954 on Windows 10
    RHEL8安装五笔输入法
    Enable EPEL and Local Repository on RHEL8
    Why is Yum Replaced by DNF?
    检查Linux服务器是否被攻击的常用命令及方法
    IDEA 主题
    IDEA 如何显示一个类中所有的方法
    Appium 安装以及安装过程中遇到的问题
    Maven 如何发布 jar 包到 Nexus 私库
    java泛型的基本使用
  • 原文地址:https://www.cnblogs.com/dudumiaomiao/p/7242002.html
Copyright © 2011-2022 走看看