zoukankan      html  css  js  c++  java
  • apache pdfbox简单读取内容

    1. Maven 依赖

            <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
            <dependency>
                <groupId>org.apache.pdfbox</groupId>
                <artifactId>pdfbox</artifactId>
                <version>3.0.0-RC1</version>
            </dependency>

    2. 读取内容

    import org.apache.pdfbox.Loader;
    import org.apache.pdfbox.cos.COSName;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.pdmodel.PDPage;
    import org.apache.pdfbox.pdmodel.PDResources;
    import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
    import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
    import org.apache.pdfbox.text.PDFTextStripper;
    
    import javax.imageio.ImageIO;
    import java.awt.image.BufferedImage;
    import java.io.File;
    import java.io.FileWriter;
    import java.util.Iterator;
    
    /**
     * Description: apache pdfbox demo
     *
     * @version V1.0
     */
    public class ApachePdfbox {
        public static void main(String[] args) throws Exception {
            String pdfPath = "E:\1.pdf";
            String targetPath = "E:\test\";
    
            //读取图片
            //readImgFromPdf(pdfPath, targetPath);
    
            //读取文本
            readTextFromPdf(pdfPath,targetPath);
    
            //读取内容包括图片和文本
            //readFromPdf(pdfPath,targetPath);
        }
    
        /**
         * 读取图片
         *
         * @param filePath
         * @param imgPath
         * @throws Exception
         */
        public static void readImgFromPdf(String filePath, String imgPath) throws Exception {
            // 加载一个pdf对象
            File file = new File(filePath);
            PDDocument doc = Loader.loadPDF(file);
            int pages = doc.getNumberOfPages();
            for (int i = 0; i < pages; i++) {
                //读取图片
                readImg(i, doc, imgPath);
            }
        }
    
        /**
         * 读取文本和图片
         *
         * @param filePath
         * @throws Exception
         */
        public static void readFromPdf(String filePath, String targetPath) throws Exception {
            // 加载一个pdf对象
            PDDocument doc = Loader.loadPDF(new File(filePath));
            int pages = doc.getNumberOfPages();
            //获取文件名称
            String substring = filePath.substring(filePath.lastIndexOf("\") + 1, filePath.lastIndexOf("."));
            String txtFile = targetPath + substring + ".txt";
            for (int i = 0; i < pages; i++) {
                //读取文字
                readText(i, doc, txtFile);
                //读取图片
                readImg(i, doc, targetPath);
            }
    
        }
    
        /**
         * 读取图片
         *
         * @param page
         * @param doc
         * @param imgPath
         * @throws Exception
         */
        private static void readImg(int page, PDDocument doc, String imgPath) throws Exception {
            int index = 1;
            PDPage pdPage = doc.getPage(page);
            PDResources resources = pdPage.getResources();
            Iterable<COSName> xObjectNames = resources.getXObjectNames();
            if (xObjectNames != null) {
                Iterator<COSName> iterator = xObjectNames.iterator();
                while (iterator.hasNext()) {
                    COSName next = iterator.next();
                    if (resources.isImageXObject(next)) {
                        PDImageXObject xObject = (PDImageXObject) resources.getXObject(next);
                        BufferedImage bufferedImage = xObject.getImage();
                        String fileName = imgPath + (page + 1) + "-" + index + ".jpg";
                        File file1 = new File(fileName);
                        ImageIO.write(bufferedImage, "jpg", file1);
                        index++;
                    }
                }
            }
        }
    
        /**
         * 读取文本
         *
         * @param page
         * @param doc
         * @param textFile
         * @throws Exception
         */
        private static void readText(int page, PDDocument doc, String textFile) throws Exception {
            if (doc == null) {
                return;
            }
            FileWriter fileWriter = new FileWriter(textFile, true);
            AccessPermission ap = doc.getCurrentAccessPermission();
            if (!ap.canExtractContent()) {
                String tmp = "can not extract content";
                tmp = tmp + System.lineSeparator() + "--------" + (page + 1) + "/" + doc.getNumberOfPages() + "--------" + System.lineSeparator();
                fileWriter.write(tmp);
                fileWriter.close();
                return;
            }
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);
            stripper.setStartPage(page + 1);
            stripper.setEndPage(page + 1);
            String text = stripper.getText(doc);
            text = text + System.lineSeparator() + "--------" + (page + 1) + "/" + doc.getNumberOfPages() + "--------" + System.lineSeparator();
            fileWriter.write(text);
            fileWriter.close();
        }
    
        /**
         * 读取文本
         *
         * @param
         * @throws Exception
         */
        public static void readTextFromPdf(String filePath, String targetPath) throws Exception {
            // 加载一个pdf对象
            PDDocument doc = Loader.loadPDF(new File(filePath));
            int pages = doc.getNumberOfPages();
            //获取文件名称
            String substring = filePath.substring(filePath.lastIndexOf("\") + 1, filePath.lastIndexOf("."));
            //文本保存的位置
            String txtFile = targetPath + substring + ".txt";
            for (int i = 0; i < pages; i++) {
                //读取文字
                readText(i, doc, txtFile);
            }
        }
    
    
    }
  • 相关阅读:
    Word+PS制作拼音表格
    VC6.0 突然打不开dsw 工程文件的解决方案
    C# 字符串的连接
    ASP.NET中弹出消息框的几种常见方法
    用五分钟重温委托,匿名方法,Lambda,泛型委托,表达式树
    WPF 显示模态窗口和窗体
    mysql5.5安装图解
    Microsoft Visual Studio 2010 Service Pack 1(exe)
    HTTP错误 404.17
    2014-2-7
  • 原文地址:https://www.cnblogs.com/maixiaodou/p/15219667.html
Copyright © 2011-2022 走看看