zoukankan      html  css  js  c++  java
  • python 将pdf分页后插入至word中

    所用技术

      1. python编程基础

      2. 使用pyPdf

      3. 使用python操作word

      4. 正则表达式的使用

      5. windows的bat编程

    下面是一个pyPdf库使用的示例:

        from pyPdf import PdfFileWriter, PdfFileReader
    
        # Demonstrates the basic pyPdf operations: copy, rotate, watermark and
        # crop pages of document1.pdf into a new document-output.pdf.
        output = PdfFileWriter()
        # The source streams are closed only after output.write() below, in
        # case the writer still needs to read page data from them.
        inputStream = open("document1.pdf", "rb")
        input1 = PdfFileReader(inputStream)

        # add page 1 from input1 to output document, unchanged
        output.addPage(input1.getPage(0))

        # add page 2 from input1, but rotated clockwise 90 degrees
        output.addPage(input1.getPage(1).rotateClockwise(90))

        # add page 3 from input1, rotated the other way:
        output.addPage(input1.getPage(2).rotateCounterClockwise(90))
        # alt: output.addPage(input1.getPage(2).rotateClockwise(270))

        # add page 4 from input1, but first add a watermark from another pdf:
        page4 = input1.getPage(3)
        watermarkStream = open("watermark.pdf", "rb")
        watermark = PdfFileReader(watermarkStream)
        page4.mergePage(watermark.getPage(0))
        # the watermarked page must be added explicitly, otherwise it never
        # reaches the output document
        output.addPage(page4)

        # add page 5 from input1, but crop it to half size:
        page5 = input1.getPage(4)
        page5.mediaBox.upperRight = (
            page5.mediaBox.getUpperRight_x() / 2,
            page5.mediaBox.getUpperRight_y() / 2
        )
        output.addPage(page5)

        # print how many pages input1 has:
        print("document1.pdf has %s pages." % input1.getNumPages())

        # finally, write "output" to document-output.pdf and release all files
        outputStream = open("document-output.pdf", "wb")
        try:
            output.write(outputStream)
        finally:
            outputStream.close()
            watermarkStream.close()
            inputStream.close()

    有了该库,就可以很容易将现有的pdf做分割。

    我的需求是将pdf每一页中的关键字提取出来,用它来作为文件名。pyPdf提供了将pdf页面中的文字全部提取出来的方法:

    # Pull the text out of the first page (page index 0).
    inputfile.getPage(0).extractText()

    这里返回的是unicode,需要转为str:

    # Same extraction, with the result encoded to UTF-8.
    inputfile.getPage(0).extractText().encode("utf-8")

    然后将每页的关键字提取出来,增加函数如下:

    # Captures (non-greedily) the text between "Blattname: " and "project".
    p_sheetName = re.compile('Blattname: (.+?)project')
    def getSheetName(str):
        """Return the sheet name found in the page text, or None if absent."""
        found = p_sheetName.search(str)
        if found is None:
            return None
        return found.group(1)

    最终代码如下:

    from pyPdf import PdfFileWriter, PdfFileReader
    import re,os
    
    # Pattern for the page's sheet name: the shortest run of characters that
    # follows "Blattname: " and precedes "project".
    p_sheetName = re.compile('Blattname: (.+?)project')
    def getSheetName(str):
        """Look up the sheet name in the given page text.

        Returns the captured name, or None when the pattern does not occur.
        """
        hit = p_sheetName.search(str)
        return hit.group(1) if hit else None
    
    def splitpdf(srcFile):
        """Split a PDF into one single-page PDF per page.

        The pages are written into a folder named like ``srcFile`` without its
        extension (created when missing). Each file is called
        "<page number> <sheet name>.pdf", the sheet name coming from
        getSheetName(); a page without a sheet name is saved as
        "<page number>.pdf" instead of "<page number> None.pdf".

        Both the source file and every output file are closed even when
        reading or writing fails.
        """
        folderName = os.path.splitext(srcFile)[0]
        input1 = open(srcFile, "rb")
        try:
            inputfile = PdfFileReader(input1)
            numofpages = inputfile.getNumPages()
            print("pages: %d" % numofpages)
            # new directory
            if not os.path.isdir(folderName):
                os.makedirs(folderName)
            # page_index is 1-based for naming; pyPdf pages are 0-based
            for page_index in range(1, numofpages + 1):
                page = inputfile.getPage(page_index - 1)
                output = PdfFileWriter()
                output.addPage(page)

                sheetName = getSheetName(page.extractText().encode("utf-8"))
                if sheetName:
                    baseName = "%d %s.pdf" % (page_index, sheetName)
                else:
                    baseName = "%d.pdf" % page_index
                # save file
                saveFileName = os.path.join(folderName, baseName)
                print(saveFileName)
                outputFile = open(saveFileName, "wb")
                try:
                    output.write(outputFile)
                finally:
                    outputFile.close()
        finally:
            input1.close()
    
    
    # Raw string: in a normal literal the "\t" of "E:\test.pdf" would be read
    # as a TAB character and the path would be wrong.
    splitpdf(r"E:\test.pdf")

    下一步,将pdf文件名参数化(从命令行参数读取):

    from pyPdf import PdfFileWriter, PdfFileReader
    import re,sys,os,string
    
    def translator(frm='', to='', delete='', keep=None):
        if len(to) == 1 :
            to = to * len(frm)
        trans = string.maketrans(frm,to)
        if keep is not None:
            allchars = string.maketrans('','')
            delete = allchars.translate(allchars,keep.translate(allchars,delete))
        def translate(s):
            return s.translate(trans,delete)
        return translate
    
    # Strips characters that Windows does not allow in file names. The raw
    # string keeps the backslash literal instead of relying on "\?" being an
    # unknown escape, and the double quote (also forbidden) is included.
    delete_some_speicl = translator(delete=r'/:\?*><|"')
    
    # Captures (non-greedily) the text between "Blattname: " and "project".
    p_sheetName = re.compile('Blattname: (.+?)project')
    def getSheetName(str):
        """Return the page's sheet name, cleaned for use in a file name.

        The captured name is passed through delete_some_speicl(). When the
        page text contains no sheet name, an empty string is returned rather
        than failing with AttributeError on the missing match.
        """
        m = p_sheetName.search(str)
        if m is None:
            return ''
        return delete_some_speicl(m.group(1))
    
    def splitpdf(srcFile):
        """Split a PDF into one single-page PDF per page and report the result.

        The pages are written into a folder named like ``srcFile`` without its
        extension (created when missing), each as
        "<page number> <sheet name>.pdf", or "<page number>.pdf" when the
        sheet name is empty. The extension check ignores case, so "X.PDF" is
        accepted too.

        This is the script's top-level boundary: any error is printed instead
        of propagated. Open files are closed even when an error occurs.
        """
        try:
            folderName, ext_ = os.path.splitext(srcFile)
            if ext_.lower() != '.pdf':
                raise Exception(os.path.basename(srcFile) + " is not pdf!")
            input1 = open(srcFile, "rb")
            try:
                inputfile = PdfFileReader(input1)
                numofpages = inputfile.getNumPages()
                print("pages: %d" % numofpages)
                # new directory
                if not os.path.isdir(folderName):
                    os.makedirs(folderName)
                # page_index is 1-based for naming; pyPdf pages are 0-based
                for page_index in range(1, numofpages + 1):
                    page = inputfile.getPage(page_index - 1)
                    output = PdfFileWriter()
                    output.addPage(page)

                    sheetName = getSheetName(page.extractText().encode("utf-8"))
                    if sheetName:
                        baseName = "%d %s.pdf" % (page_index, sheetName)
                    else:
                        baseName = "%d.pdf" % page_index
                    # save file
                    saveFileName = os.path.join(folderName, baseName)
                    print(saveFileName)
                    outputFile = open(saveFileName, "wb")
                    try:
                        output.write(outputFile)
                    finally:
                        outputFile.close()
            finally:
                input1.close()
            print("Split success!")
            print("please find them at " + folderName)
        except Exception as e:
            print(e)
    
    # Command-line entry point: expects the PDF path as the first argument.
    if __name__ == '__main__':
        if len(sys.argv) < 2:
            print('usage: %s filename' % os.path.basename(sys.argv[0]))
            # A missing argument is an error, so exit non-zero; sys.exit is
            # used because the bare exit() helper is only provided by site.
            sys.exit(1)
        splitpdf(sys.argv[1])

    这里的translator函数用于将关键字中的特殊字符过滤掉,否则用关键字新建文件时可能会出错。

    其实把pdf分割之后还需要一些手动操作,或者还得用vba导入到word中。我想直接用python干完这些事,于是就用到了win32com来操作word。

    下面是使用操作word的一个示例:

    import win32com
    from win32com.client import Dispatch, constants
    
    # NOTE: filenamein, filenameout, OldStr and NewStr are not defined in this
    # snippet; supply your own values before running it.
    w = win32com.client.Dispatch('Word.Application')
    # Alternatively, start Word in a separate process:
    # w = win32com.client.DispatchEx('Word.Application')

    # Run in the background: no window, no alerts
    w.Visible = 0
    w.DisplayAlerts = 0

    # Open an existing file
    doc = w.Documents.Open( FileName = filenamein )
    # doc = w.Documents.Add() # or create a new document instead

    # Insert text
    myRange = doc.Range(0,0)
    myRange.InsertBefore('Hello from Python!')

    # Apply a style. Select() does not hand back a selection object, so select
    # the range first and then style the application's Selection.
    myRange.Select()
    w.Selection.Style = constants.wdStyleHeading1

    # Replace text in the document body
    w.Selection.Find.ClearFormatting()
    w.Selection.Find.Replacement.ClearFormatting()
    w.Selection.Find.Execute(OldStr, False, False, False, False, False, True, 1, True, NewStr, 2)

    # Replace text in the page header
    w.ActiveDocument.Sections[0].Headers[0].Range.Find.ClearFormatting()
    w.ActiveDocument.Sections[0].Headers[0].Range.Find.Replacement.ClearFormatting()
    w.ActiveDocument.Sections[0].Headers[0].Range.Find.Execute(OldStr, False, False, False, False, False, True, 1, False, NewStr, 2)

    # Table operations (on the document opened above)
    doc.Tables[0].Rows[0].Cells[0].Range.Text ='123123'
    doc.Tables[0].Rows.Add() # append a row

    # Convert to HTML
    wc = win32com.client.constants
    w.ActiveDocument.WebOptions.RelyOnCSS = 1
    w.ActiveDocument.WebOptions.OptimizeForBrowser = 1
    w.ActiveDocument.WebOptions.BrowserLevel = 0 # constants.wdBrowserLevelV4
    w.ActiveDocument.WebOptions.OrganizeInFolder = 0
    w.ActiveDocument.WebOptions.UseLongFileNames = 1
    w.ActiveDocument.WebOptions.RelyOnVML = 0
    w.ActiveDocument.WebOptions.AllowPNG = 1
    w.ActiveDocument.SaveAs( FileName = filenameout, FileFormat = wc.wdFormatHTML )

    # Print
    doc.PrintOut()

    # Close
    # doc.Close()
    w.Documents.Close(wc.wdDoNotSaveChanges)
    w.Quit()

    仿照上例,修改前面的代码如下:

    from pyPdf import PdfFileWriter, PdfFileReader
    import re,sys,os,string,win32com
    from win32com.client import Dispatch, constants
    # Runs once at import time, before any Word automation below.
    # NOTE(review): presumably this makes sure the generated wrappers for Word
    # exist so that `constants` (wdStyleHeading1, wdPageBreak, ...) is
    # populated -- confirm against the pywin32 documentation.
    win32com.client.gencache.EnsureDispatch('Word.Application')
    
    
    def translator(frm='', to='', delete='', keep=None):
        if len(to) == 1 :
            to = to * len(frm)
        trans = string.maketrans(frm,to)
        if keep is not None:
            allchars = string.maketrans('','')
            delete = allchars.translate(allchars,keep.translate(allchars,delete))
        def translate(s):
            return s.translate(trans,delete)
        return translate
    
    # Strips characters that Windows does not allow in file names. The raw
    # string keeps the backslash literal instead of relying on "\?" being an
    # unknown escape, and the double quote (also forbidden) is included.
    delete_some_speicl = translator(delete=r'/:\?*><|"')
    
    # Captures (non-greedily) the text between "Blattname: " and "project".
    p_sheetName = re.compile('Blattname: (.+?)project')
    def getSheetName(str):
        """Return the sheet name found in the page text.

        Returns an empty string when the text contains no sheet name, so that
        callers can keep formatting and sanitizing the result as a string
        instead of hitting AttributeError on the missing match.
        """
        m = p_sheetName.search(str)
        if m is None:
            return ''
        return m.group(1)
    
    def splitPdfToWord(srcFile):
        """Split a PDF into single-page PDFs and embed them in a Word document.

        For every page a heading ("Page<n> <sheet name>") is typed into a new
        Word document, the page is saved as its own PDF in a folder named like
        ``srcFile`` without extension, and that PDF is inserted as an OLE
        object followed by a page break. The document is saved next to the
        folder as "<folder>.doc".

        This is the script's top-level boundary: any error is printed instead
        of propagated. Word is quit only if it was actually started, and open
        files are closed even when an error occurs.
        """
        wordApp = None  # stays None if we fail before Word is launched
        try:
            # Word runs as a separate process, so hand it absolute paths;
            # a relative path would not be resolved against our directory.
            folderName, ext_ = os.path.splitext(os.path.abspath(srcFile))
            if ext_.lower() != '.pdf':
                raise Exception(os.path.basename(srcFile) + " is not pdf!")
            input1 = open(srcFile, "rb")
            try:
                inputfile = PdfFileReader(input1)
                numofpages = inputfile.getNumPages()
                print("Total Pages: %d" % numofpages)
                wordApp = win32com.client.Dispatch('Word.Application')
                wordApp.Visible = False
                wordApp.DisplayAlerts = 0
                doc = wordApp.Documents.Add()
                sel = wordApp.Selection
                # new directory
                if not os.path.isdir(folderName):
                    os.makedirs(folderName)
                # page_index is 1-based for naming; pyPdf pages are 0-based
                for page_index in range(1, numofpages + 1):
                    page = inputfile.getPage(page_index - 1)
                    output = PdfFileWriter()
                    output.addPage(page)

                    sheetName = getSheetName(page.extractText().encode("utf-8"))
                    sel.Style = constants.wdStyleHeading1
                    sel.TypeText("Page%d %s" % (page_index, sheetName))
                    # the heading keeps the full name; the file name must not
                    # contain characters the file system rejects
                    sheetName = delete_some_speicl(sheetName)
                    if sheetName:
                        baseName = "%d %s.pdf" % (page_index, sheetName)
                    else:
                        # nothing left after sanitizing: avoid "N .pdf"
                        baseName = "%d.pdf" % page_index
                    # save file
                    saveFileName = os.path.join(folderName, baseName)
                    print("Add Page %d" % page_index)
                    outputFile = open(saveFileName, "wb")
                    try:
                        output.write(outputFile)
                    finally:
                        outputFile.close()
                    sel.TypeParagraph()
                    sel.Style = constants.wdStyleBodyText
                    sel.InlineShapes.AddOLEObject(ClassType="AcroExch.Document.11",FileName=saveFileName)
                    sel.InsertBreak(Type=constants.wdPageBreak)
            finally:
                input1.close()
            doc.SaveAs(folderName+".doc")
            print("Split success!")
            print("please find them at " + folderName)
            print("create word document success!")
            print("Location:" + folderName + ".doc")
        except Exception as e:
            print(e)
        finally:
            if wordApp is not None:
                wordApp.Quit()
    
    # Command-line entry point: the PDF path is the first argument; without
    # it, print a usage line and exit with status 1.
    if __name__ == '__main__':
        cli_args = sys.argv
        if len(cli_args) >= 2:
            splitPdfToWord(cli_args[1])
        else:
            print('usage: %s filename' % os.path.basename(cli_args[0]))
            sys.exit(1)
  • 相关阅读:
    js 页面按钮提交后 创建显示loading div 操作完成后 再隐藏或删除 进度div
    [转]利用vertical-align:middle实现在整个页面居中
    IP地址查询
    [转]js 判断js函数、变量是否存在
    [转]RDLC报表格式化format表达式
    [转]不用安装Oracle Client如何使用PLSQL Developer
    [转]使用 YCombo 做 JS /CSS开发 合并 压缩
    [转]jQuery为控件添加水印文字
    [转]DataTable用中使用Compute 实现简单的DataTable数据的统计
    [转]Web性能监控自动化探索之路–初识WebPageTest
  • 原文地址:https://www.cnblogs.com/zhangyonghugo/p/3501065.html
Copyright © 2011-2022 走看看