zoukankan html css js c++ java

apache pdfbox简单读取内容

1. maven

        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>3.0.0-RC1</version>
        </dependency>

2. 读取内容

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileWriter;
import java.util.Iterator;

/**
 * Description: apache pdfbox demo
 *
 * @version V1.0
 */
public class ApachePdfbox {
    public static void main(String[] args) throws Exception {
        String pdfPath = "E:\1.pdf";
        String targetPath = "E:\test\";

        //读取图片
        //readImgFromPdf(pdfPath, targetPath);

        //读取文本
        readTextFromPdf(pdfPath,targetPath);

        //读取内容包括图片和文本
        //readFromPdf(pdfPath,targetPath);
    }

    /**
     * 读取图片
     *
     * @param filePath
     * @param imgPath
     * @throws Exception
     */
    public static void readImgFromPdf(String filePath, String imgPath) throws Exception {
        // 加载一个pdf对象
        File file = new File(filePath);
        PDDocument doc = Loader.loadPDF(file);
        int pages = doc.getNumberOfPages();
        for (int i = 0; i < pages; i++) {
            //读取图片
            readImg(i, doc, imgPath);
        }
    }

    /**
     * 读取文本和图片
     *
     * @param filePath
     * @throws Exception
     */
    public static void readFromPdf(String filePath, String targetPath) throws Exception {
        // 加载一个pdf对象
        PDDocument doc = Loader.loadPDF(new File(filePath));
        int pages = doc.getNumberOfPages();
        //获取文件名称
        String substring = filePath.substring(filePath.lastIndexOf("\") + 1, filePath.lastIndexOf("."));
        String txtFile = targetPath + substring + ".txt";
        for (int i = 0; i < pages; i++) {
            //读取文字
            readText(i, doc, txtFile);
            //读取图片
            readImg(i, doc, targetPath);
        }

    }

    /**
     * 读取图片
     *
     * @param page
     * @param doc
     * @param imgPath
     * @throws Exception
     */
    private static void readImg(int page, PDDocument doc, String imgPath) throws Exception {
        int index = 1;
        PDPage pdPage = doc.getPage(page);
        PDResources resources = pdPage.getResources();
        Iterable<COSName> xObjectNames = resources.getXObjectNames();
        if (xObjectNames != null) {
            Iterator<COSName> iterator = xObjectNames.iterator();
            while (iterator.hasNext()) {
                COSName next = iterator.next();
                if (resources.isImageXObject(next)) {
                    PDImageXObject xObject = (PDImageXObject) resources.getXObject(next);
                    BufferedImage bufferedImage = xObject.getImage();
                    String fileName = imgPath + (page + 1) + "-" + index + ".jpg";
                    File file1 = new File(fileName);
                    ImageIO.write(bufferedImage, "jpg", file1);
                    index++;
                }
            }
        }
    }

    /**
     * 读取文本
     *
     * @param page
     * @param doc
     * @param textFile
     * @throws Exception
     */
    private static void readText(int page, PDDocument doc, String textFile) throws Exception {
        if (doc == null) {
            return;
        }
        FileWriter fileWriter = new FileWriter(textFile, true);
        AccessPermission ap = doc.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
            String tmp = "can not extract content";
            tmp = tmp + System.lineSeparator() + "--------" + (page + 1) + "/" + doc.getNumberOfPages() + "--------" + System.lineSeparator();
            fileWriter.write(tmp);
            fileWriter.close();
            return;
        }
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setSortByPosition(true);
        stripper.setStartPage(page + 1);
        stripper.setEndPage(page + 1);
        String text = stripper.getText(doc);
        text = text + System.lineSeparator() + "--------" + (page + 1) + "/" + doc.getNumberOfPages() + "--------" + System.lineSeparator();
        fileWriter.write(text);
        fileWriter.close();
    }

    /**
     * 读取文本
     *
     * @param
     * @throws Exception
     */
    public static void readTextFromPdf(String filePath, String targetPath) throws Exception {
        // 加载一个pdf对象
        PDDocument doc = Loader.loadPDF(new File(filePath));
        int pages = doc.getNumberOfPages();
        //获取文件名称
        String substring = filePath.substring(filePath.lastIndexOf("\") + 1, filePath.lastIndexOf("."));
        //文本保存的位置
        String txtFile = targetPath + substring + ".txt";
        for (int i = 0; i < pages; i++) {
            //读取文字
            readText(i, doc, txtFile);
        }
    }


}

查看全文

相关阅读:
期末总结
 作业01 第一次作业入门
 C语言I博客作业09
C语言I博客作业08
C语言I博客作业07
C语言I博客作业06
C语言|博客作业05
通过Excel批量导入数据-Java代码
 python3-easygui模块安装
 com.alibaba.fastjson.JSONArray cannot be cast to XX

原文地址：https://www.cnblogs.com/maixiaodou/p/15219667.html