zoukankan      html  css  js  c++  java
  • Java使用PDFBox操作PDF文件获取页码、文章内容、缩略图

    
    一、依赖
    
    <!--使用的是pdfbox计数总页数与缩略图-->
    <!-- https://mvnrepository.com/artifact/com.sleepycat/je -->
    <dependency>
        <groupId>com.sleepycat</groupId>
        <artifactId>je</artifactId>
        <version>5.0.73</version>
    </dependency>
    
    <!--pdf-->
    <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.8</version>
    </dependency>

    二、实现代码

    
    
    import lombok.extern.slf4j.Slf4j;
    import org.apache.pdfbox.pdfparser.PDFParser;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.rendering.ImageType;
    import org.apache.pdfbox.rendering.PDFRenderer;
    import org.apache.pdfbox.text.PDFTextStripper;
    
    import javax.imageio.IIOImage;
    import javax.imageio.ImageIO;
    import javax.imageio.ImageWriter;
    import javax.imageio.stream.ImageOutputStream;
    import java.awt.image.BufferedImage;
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.util.Iterator;
    @Slf4j
    public class PdfUtil {
    
    
        /**
         * 通过PDFbox获取文章总页数
         *
         * @param filePath:文件路径
         * @return
         * @throws IOException
         */
        public static int getNumberOfPages(String filePath) throws IOException, InterruptedException {
                    File file = new File(filePath);
                    PDDocument pdDocument = PDDocument.load(new File(filePath));
                    int pages = pdDocument.getNumberOfPages();
                    pdDocument.close();
                    return pages;
            }
        }
    
        /**
         * 通过PDFbox获取文章内容
         *
         * @param filePath
         * @return
         */
        public static String getContent(String filePath) throws IOException {
            PDFParser pdfParser = new PDFParser(new org.apache.pdfbox.io.RandomAccessFile(new File(filePath), "rw"));
            pdfParser.parse();
            PDDocument pdDocument = pdfParser.getPDDocument();
            String text = new PDFTextStripper().getText(pdDocument);
            pdDocument.close();
    
            return text;
        }
    
        /**
         * 通过PDFbox生成文件的缩略图
         *
         * @param filePath:文件路径
         * @param outPath:输出图片路径
         * @throws IOException
         */
        public static void getThumbnails(String filePath, String outPath) throws IOException {
            // 利用PdfBox生成图像
            PDDocument pdDocument = PDDocument.load(new File(filePath));
            PDFRenderer renderer = new PDFRenderer(pdDocument);
    
            // 构造图片
            BufferedImage img_temp = renderer.renderImageWithDPI(0, 30, ImageType.RGB);
            // 设置图片格式
            Iterator<ImageWriter> it = ImageIO.getImageWritersBySuffix("png");
            // 将文件写出
            ImageWriter writer = (ImageWriter) it.next();
            ImageOutputStream imageout = ImageIO.createImageOutputStream(new FileOutputStream(outPath));
            writer.setOutput(imageout);
            writer.write(new IIOImage(img_temp, null, null));
            img_temp.flush();
            imageout.flush();
            imageout.close();
            //Warning: You did not close a PDF Document
            pdDocument.close();
        }
    }

    三、测试类--Main

    
    
    import java.io.IOException;
    /**
     * @author Mr.lu
     * @Title: Main
     * @ProjectName DocCloud
     * @Description: TODO
     * @date 2018/11/6:22:17
     */
    public class Main {
        public static void main(String[] args) throws IOException, InterruptedException {
            int numberOfPages = getNumberOfPages("D:\Desktop\DocCloud\testDir\hadoopClientCode.pdf");
            System.out.println(numberOfPages);
            String content = getContent("");
            System.out.println(content);
           getThumbnails("D:\Desktop\DocCloud\testDir\hadoopClientCoed.pdf",
                    "D:\Desktop\DocCloud\testDir\hadoopClientCoed.pdf.png");
        }
    }
    

    1>首先测试获取PDF文件的总页数,在控制台可以看到输出的页数

    2>测试获取PDF文件的内容,在控制台可以看到--你自己PDF文件中的内容

    3>测试生成PDF缩略图

    缩略图的大小可以通过修改代码中 renderImageWithDPI 的 DPI 参数(当前为 30)来调整,DPI 越大,生成的图片越大

  • 相关阅读:
    strcat strcpy 使用出现的问题汇总
    MySql Host is blocked because of many connection errors; unblock with 'mysqladmin flush-hosts' 解决方法
    nginx 设置反响代理实现nginx集群
    js 去掉字符串最后一个字符
    二维数组 获取某键值集合
    oracle 序列
    递归数据查询
    oracle 递归查询
    jQuery EasyUI API 中文文档
    SecureCRT使用的技巧 键盘修改
  • 原文地址:https://www.cnblogs.com/pigdata/p/10305576.html
Copyright © 2011-2022 走看看