[转]JAVA代码PDFBOX对pdf文件的操作

zoukankan html css js c++ java

[转]JAVA代码PDFBOX对pdf文件的操作
转载地址：http://blog.csdn.net/reserved_person/article/details/52785153

PDFBox是Java实现的PDF文档协作类库，提供PDF文档的创建、处理以及文档内容提取功能，也包含了一些命令行实用工具。其主要特性包括：
1、提取PDF文件的Unicode文本
2、将PDF切分成多个PDF文件或合并多个PDF文件
3、从PDF表格中提取数据或填写PDF表格
4、验证PDF文件是否符合PDF/A-1b标准
5、使用标准的Java API打印PDF文件
6、将PDF文件保存为图像文件，如PNG、JPEG
7、创建一个PDF文件，包含嵌入的字体和图像
8、对PDF文件进行数字签名，以及对PDF文档进行加密与解密

[java] view plain copy

print ?

import static readPDFContent.PDFParse.dateFormat;

import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.util.PDFTextStripper;



/**

*

* @author Angela

*/publicclassPDFReader {/**

     * 获取格式化后的时间信息

     * @param calendar   时间信息

     * @return     */publicstatic String dateFormat( Calendar calendar ){

        if( null == calendar )

            returnnull;

        String date = null;

        String pattern = "yyyy-MM-dd HH:mm:ss";

        SimpleDateFormat format = new SimpleDateFormat( pattern );

        date = format.format( calendar.getTime() );

        return date == null ? "" : date;

    }



        /**打印纲要**/publicstaticvoidgetPDFOutline(String file){

        try {

            //打开pdf文件流

            FileInputStream fis = new   FileInputStream(file);

            //加载 pdf 文档,获取PDDocument文档对象

            PDDocument document=PDDocument.load(fis);

            //获取PDDocumentCatalog文档目录对象

            PDDocumentCatalog catalog=document.getDocumentCatalog();

            //获取PDDocumentOutline文档纲要对象

            PDDocumentOutline outline=catalog.getDocumentOutline();

            //获取第一个纲要条目（标题1）

            PDOutlineItem item=outline.getFirstChild();

            if(outline!=null){

                //遍历每一个标题1while(item!=null){

                    //打印标题1的文本

                    System.out.println("Item:"+item.getTitle());

                    //获取标题1下的第一个子标题（标题2）

                    PDOutlineItem child=item.getFirstChild();

                    //遍历每一个标题2while(child!=null){

                        //打印标题2的文本

                        System.out.println("    Child:"+child.getTitle());

                        //指向下一个标题2

                        child=child.getNextSibling();

                    }

                    //指向下一个标题1

                    item=item.getNextSibling();

                }

            }

            //关闭输入流

            document.close();

            fis.close();

        } catch (FileNotFoundException ex) {

            Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);

        } catch (IOException ex) {

            Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);

        }

    }



    /**打印一级目录**/publicstaticvoidgetPDFCatalog(String file){

        try {

            //打开pdf文件流

            FileInputStream fis = new   FileInputStream(file);

            //加载 pdf 文档,获取PDDocument文档对象

            PDDocument document=PDDocument.load(fis);

            //获取PDDocumentCatalog文档目录对象

            PDDocumentCatalog catalog=document.getDocumentCatalog();

            //获取PDDocumentOutline文档纲要对象

            PDDocumentOutline outline=catalog.getDocumentOutline();

            //获取第一个纲要条目（标题1）if(outline!=null){

                PDOutlineItem item=outline.getFirstChild();

                //遍历每一个标题1while(item!=null){

                    //打印标题1的文本

                    System.out.println("Item:"+item.getTitle());

                    //指向下一个标题1

                    item=item.getNextSibling();

                }

            }

            //关闭输入流

            document.close();

            fis.close();

        } catch (FileNotFoundException ex) {

            Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);

        } catch (IOException ex) {

            Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);

        }

    }



    /**获取PDF文档元数据**/publicstaticvoidgetPDFInformation(String file){

        try {

            //打开pdf文件流

            FileInputStream fis = new   FileInputStream(file);

            //加载 pdf 文档,获取PDDocument文档对象

            PDDocument document=PDDocument.load(fis);

            /** 文档属性信息 **/            PDDocumentInformation info = document.getDocumentInformation();



            System.out.println("页数:"+document.getNumberOfPages());



            System.out.println( "标题:" + info.getTitle() );

            System.out.println( "主题:" + info.getSubject() );

            System.out.println( "作者:" + info.getAuthor() );

            System.out.println( "关键字:" + info.getKeywords() );



            System.out.println( "应用程序:" + info.getCreator() );

            System.out.println( "pdf 制作程序:" + info.getProducer() );



            System.out.println( "Trapped:" + info.getTrapped() );



            System.out.println( "创建时间:" + dateFormat( info.getCreationDate() ));

            System.out.println( "修改时间:" + dateFormat( info.getModificationDate()));



            //关闭输入流

            document.close();

            fis.close();

        } catch (FileNotFoundException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        } catch (IOException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        }

    }



    /**提取pdf文本**/publicstaticvoidextractTXT(String file){

        try{

            //打开pdf文件流

            FileInputStream fis = new   FileInputStream(file);

            //实例化一个PDF解析器

            PDFParser parser = new PDFParser(fis);

            //解析pdf文档

            parser.parse();

            //获取PDDocument文档对象

            PDDocument document=parser.getPDDocument();

            //获取一个PDFTextStripper文本剥离对象

            PDFTextStripper stripper = new PDFTextStripper();

            //获取文本内容

            String content = stripper.getText(document);

            //打印内容

            System.out.println( "内容:" + content );

            document.close();

            fis.close();

        } catch (FileNotFoundException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        } catch (IOException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        }

    }



    /**

     * 提取部分页面文本

     * @param file pdf文档路径

     * @param startPage 开始页数

     * @param endPage 结束页数

     */publicstaticvoidextractTXT(String file,int startPage,int endPage){

        try{

            //打开pdf文件流

            FileInputStream fis = new   FileInputStream(file);

            //实例化一个PDF解析器

            PDFParser parser = new PDFParser(fis);

            //解析pdf文档

            parser.parse();

            //获取PDDocument文档对象

            PDDocument document=parser.getPDDocument();

            //获取一个PDFTextStripper文本剥离对象

            PDFTextStripper stripper = new PDFTextStripper();

            // 设置起始页

            stripper.setStartPage(startPage);

            // 设置结束页

            stripper.setEndPage(endPage);

            //获取文本内容

            String content = stripper.getText(document);

            //打印内容

            System.out.println( "内容:" + content );

            document.close();

            fis.close();

        } catch (FileNotFoundException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        } catch (IOException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        }

    }



    /**

     * 提取图片并保存

     * @param file PDF文档路径

     * @param imgSavePath 图片保存路径

     */publicstaticvoidextractImage(String file,String imgSavePath){

        try{

            //打开pdf文件流

            FileInputStream fis = new   FileInputStream(file);

            //加载 pdf 文档,获取PDDocument文档对象

            PDDocument document=PDDocument.load(fis);

            /** 文档页面信息 **///获取PDDocumentCatalog文档目录对象

            PDDocumentCatalog catalog = document.getDocumentCatalog();

            //获取文档页面PDPage列表

            List pages = catalog.getAllPages();

            int count = 1;

            int pageNum=pages.size();   //文档页数//遍历每一页for( int i = 0; i < pageNum; i++ ){

                //取得第i页

                PDPage page = ( PDPage ) pages.get( i );

                if( null != page ){

                    PDResources resource = page.findResources();

                    //获取页面图片信息

                    Map<String,PDXObjectImage> imgs = resource.getImages();

                    for(Map.Entry<String,PDXObjectImage> me: imgs.entrySet()){

                        //System.out.println(me.getKey());

                        PDXObjectImage img = me.getValue();

                        //保存图片，会自动添加图片后缀类型

                        img.write2file( imgSavePath + count );

                        count++;

                    }

                }

            }

            document.close();

            fis.close();

        } catch (FileNotFoundException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        } catch (IOException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        }

    }



    /**

     * 提取文本并保存

     * @param file PDF文档路径

     * @param savePath 文本保存路径

     */publicstaticvoidextractTXT(String file,String savePath){

        try{

            //打开pdf文件流

            FileInputStream fis = new   FileInputStream(file);

            //实例化一个PDF解析器

            PDFParser parser = new PDFParser(fis);

            //解析pdf文档

            parser.parse();

            //获取PDDocument文档对象

            PDDocument document=parser.getPDDocument();

            //获取一个PDFTextStripper文本剥离对象

            PDFTextStripper stripper = new PDFTextStripper();

            //创建一个输出流

            Writer writer=new OutputStreamWriter(new FileOutputStream(savePath));

            //保存文本内容

            stripper.writeText(document, writer);

            //关闭输出流

            writer.close();

            //关闭输入流

            document.close();

            fis.close();

        } catch (FileNotFoundException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        } catch (IOException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        }

    }



    /**

     * 提取部分页面文本并保存

     * @param file PDF文档路径

     * @param startPage 开始页数

     * @param endPage 结束页数

     * @param savePath 文本保存路径

     */publicstaticvoidextractTXT(String file,int startPage,

            int endPage,String savePath){

        try{

            //打开pdf文件流

            FileInputStream fis = new   FileInputStream(file);

            //实例化一个PDF解析器

            PDFParser parser = new PDFParser(fis);

            //解析pdf文档

            parser.parse();

            //获取PDDocument文档对象

            PDDocument document=parser.getPDDocument();

            //获取一个PDFTextStripper文本剥离对象

            PDFTextStripper stripper = new PDFTextStripper();

            //创建一个输出流

            Writer writer=new OutputStreamWriter(new FileOutputStream(savePath));

            // 设置起始页

            stripper.setStartPage(startPage);

            // 设置结束页

            stripper.setEndPage(endPage);

            //保存文本内容

            stripper.writeText(document, writer);

            //关闭输出流

            writer.close();

            //关闭输入流

            document.close();

            fis.close();

        } catch (FileNotFoundException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        } catch (IOException ex) {

            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);

        }

    }



    publicstaticvoidmain(String args[]){

        String file="F:\pdf\2013\000608_阳光股份_2013年年度报告(更新后)_1.pdf";

        String savePath="E:\result1.txt";

        long startTime=System.currentTimeMillis();

        extractTXT(file,savePath);

        long endTime=System.currentTimeMillis();

        System.out.println("读写所用时间为："+(endTime-startTime)+"ms");

    }



}
查看全文

相关阅读:
SQL操作全集 sql精典收藏
 sql中全角字符与半角字符检验问题
 asp.net目录权限设置图文综合[转]
XPath 语法
 XPath学习
 接口使用例子，阐述接口的优点、作用
 sql查询出表中所有列名判断两个表中的列是否相同
 指定键让指定的按钮提交
 C#.Net网络程序开发Socket篇
 ASP.NET中异常处理使用

原文地址：https://www.cnblogs.com/Crysaty/p/6472876.html