1. Maven
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox --> <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>3.0.0-RC1</version> </dependency>
2. 读取内容
import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.text.PDFTextStripper; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.File; import java.io.FileWriter; import java.util.Iterator; /** * Description: apache pdfbox demo * * @version V1.0 */ public class ApachePdfbox { public static void main(String[] args) throws Exception { String pdfPath = "E:\1.pdf"; String targetPath = "E:\test\"; //读取图片 //readImgFromPdf(pdfPath, targetPath); //读取文本 readTextFromPdf(pdfPath,targetPath); //读取内容包括图片和文本 //readFromPdf(pdfPath,targetPath); } /** * 读取图片 * * @param filePath * @param imgPath * @throws Exception */ public static void readImgFromPdf(String filePath, String imgPath) throws Exception { // 加载一个pdf对象 File file = new File(filePath); PDDocument doc = Loader.loadPDF(file); int pages = doc.getNumberOfPages(); for (int i = 0; i < pages; i++) { //读取图片 readImg(i, doc, imgPath); } } /** * 读取文本和图片 * * @param filePath * @throws Exception */ public static void readFromPdf(String filePath, String targetPath) throws Exception { // 加载一个pdf对象 PDDocument doc = Loader.loadPDF(new File(filePath)); int pages = doc.getNumberOfPages(); //获取文件名称 String substring = filePath.substring(filePath.lastIndexOf("\") + 1, filePath.lastIndexOf(".")); String txtFile = targetPath + substring + ".txt"; for (int i = 0; i < pages; i++) { //读取文字 readText(i, doc, txtFile); //读取图片 readImg(i, doc, targetPath); } } /** * 读取图片 * * @param page * @param doc * @param imgPath * @throws Exception */ private static void readImg(int page, PDDocument doc, String imgPath) throws Exception { int index = 1; PDPage pdPage = doc.getPage(page); PDResources resources = pdPage.getResources(); 
Iterable<COSName> xObjectNames = resources.getXObjectNames(); if (xObjectNames != null) { Iterator<COSName> iterator = xObjectNames.iterator(); while (iterator.hasNext()) { COSName next = iterator.next(); if (resources.isImageXObject(next)) { PDImageXObject xObject = (PDImageXObject) resources.getXObject(next); BufferedImage bufferedImage = xObject.getImage(); String fileName = imgPath + (page + 1) + "-" + index + ".jpg"; File file1 = new File(fileName); ImageIO.write(bufferedImage, "jpg", file1); index++; } } } } /** * 读取文本 * * @param page * @param doc * @param textFile * @throws Exception */ private static void readText(int page, PDDocument doc, String textFile) throws Exception { if (doc == null) { return; } FileWriter fileWriter = new FileWriter(textFile, true); AccessPermission ap = doc.getCurrentAccessPermission(); if (!ap.canExtractContent()) { String tmp = "can not extract content"; tmp = tmp + System.lineSeparator() + "--------" + (page + 1) + "/" + doc.getNumberOfPages() + "--------" + System.lineSeparator(); fileWriter.write(tmp); fileWriter.close(); return; } PDFTextStripper stripper = new PDFTextStripper(); stripper.setSortByPosition(true); stripper.setStartPage(page + 1); stripper.setEndPage(page + 1); String text = stripper.getText(doc); text = text + System.lineSeparator() + "--------" + (page + 1) + "/" + doc.getNumberOfPages() + "--------" + System.lineSeparator(); fileWriter.write(text); fileWriter.close(); } /** * 读取文本 * * @param * @throws Exception */ public static void readTextFromPdf(String filePath, String targetPath) throws Exception { // 加载一个pdf对象 PDDocument doc = Loader.loadPDF(new File(filePath)); int pages = doc.getNumberOfPages(); //获取文件名称 String substring = filePath.substring(filePath.lastIndexOf("\") + 1, filePath.lastIndexOf(".")); //文本保存的位置 String txtFile = targetPath + substring + ".txt"; for (int i = 0; i < pages; i++) { //读取文字 readText(i, doc, txtFile); } } }