zoukankan      html  css  js  c++  java
  • 根据关键字查找其在pdf 文件中的页面

    package com.icil.elsa.milestone.common.util;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    
    import com.itextpdf.text.Document;
    import com.itextpdf.text.DocumentException;
    import com.itextpdf.text.pdf.PdfCopy;
    import com.itextpdf.text.pdf.PdfDictionary;
    import com.itextpdf.text.pdf.PdfImportedPage;
    import com.itextpdf.text.pdf.PdfName;
    import com.itextpdf.text.pdf.PdfReader;
    import com.itextpdf.text.pdf.parser.ContentByteUtils;
    import com.itextpdf.text.pdf.parser.ImageRenderInfo;
    import com.itextpdf.text.pdf.parser.PdfContentStreamProcessor;
    import com.itextpdf.text.pdf.parser.RenderListener;
    import com.itextpdf.text.pdf.parser.TextRenderInfo;
    
    /**
     * @Package Name :com.icil.elsa.milestone.common.util
     * @Project Name :elsa-billing-service
     * @File Name	 : PDFUtil
     * @Version	 :1.0
     * @Author	 :peterwong
     * @Creation Date:Jun 8, 2020 8:23:56 PM
     * @Purpose      :Find the pages on which a keyword occurs in a PDF file and split the PDF by page range.
     */
    public class PDFUtil {
    	
    	   public static void main(String[] args) throws IOException {
    	        //1.给定文件
    	        File pdfFile = new File("/home/peterwong/Documents/项目流程管理.pdf");
    	        //2.定义一个byte数组,长度为文件的长度
    	        byte[] pdfData = new byte[(int) pdfFile.length()];
    
    	        //3.IO流读取文件内容到byte数组
    	        FileInputStream inputStream = null;
    	        try {
    	            inputStream = new FileInputStream(pdfFile);
    	            inputStream.read(pdfData);
    	        } catch (IOException e) {
    	            throw e;
    	        } finally {
    	            if (inputStream != null) {
    	                try {
    	                    inputStream.close();
    	                } catch (IOException e) {
    	                }
    	            }
    	        }
    
    	        //4.指定关键字
    	        String keyword = "ICIL项目流程管理";
    
    	        //5.调用方法,给定关键字和文件
    	        List<float[]> positions = findKeywordPostions(pdfData, keyword);
    
    	        //6.返回值类型是  List<float[]> 每个list元素代表一个匹配的位置,分别为 float[0]所在页码  float[1]所在x轴 float[2]所在y轴
    	        System.out.println("total:" + positions.size());
    	        int start = 0;
    	        int end = 0;
    	       
    	        if (positions != null && positions.size() > 0) {
    	        	 start = (int) positions.get(0)[0];
    	        	 end = (int) positions.get(positions.size()-1)[0];
    	            for (float[] position : positions) {
    	                System.out.print("pageNum: " + (int) position[0]);
    	                System.out.print("	x: " + position[1]);
    	                System.out.println("	y: " + position[2]);
    	            }
    	        }
    	        splitPDFByRange("/home/peterwong/Documents/", "项目流程管理.pdf", 
    					"/home/peterwong/Documents/E+/",start, end);
    	       /* splitPDFByRange("D:\inputPath", "test.pdf", 
    					"/home/peterwong/Documents/E+/",16, 30);*/
    	    }
    
    	   
    	   /**
    	    * @author Reverse_XML
    	    * 把PDF 按指定页数范围 startPage 到 endPage 拆分
    	    * @param path 源PDF路径
    	    * @param fileName 源PDF文件名
    	    * @param outputPath 拆分后输出的PDF路径
    	    * @param startPage 开始页码
    	    * @param endPage 结束页码
    	    */
    	   public static void splitPDFByRange(String path, String fileName, String outputPath, 
    	   							Integer startPage, Integer endPage) {
    	       String sep = java.io.File.separator;
    	       Document document = null;
    	       PdfCopy copy = null;
    	       PdfReader reader = null;
    	       try {
    	           reader = new PdfReader(path + sep + fileName);
    	           int numberOfPages = reader.getNumberOfPages();
    	           if (endPage == 0) {
    	               endPage = numberOfPages;
    	           }
    	           String savePath = outputPath + sep +
    	   				fileName.substring(0, fileName.lastIndexOf("."))
    	                   + "_from_" + startPage + "_to_" + endPage + "_.pdf";
    	           document = new Document(reader.getPageSize(1));
    	           copy = new PdfCopy(document, new FileOutputStream(savePath));
    	           document.open();
    	           for (int i = startPage; i <= endPage; i++) {
    	               document.newPage();
    	               PdfImportedPage page = copy.getImportedPage(reader, i);
    	               copy.addPage(page);
    	           }
    	           document.close();
    	       } catch (IOException e) {
    	           System.out.println(e.getMessage());
    	       } catch (DocumentException e) {
    	    	   System.out.println(e.getMessage());
    	       } finally {
    	           if (document != null)
    	               document.close();
    	           if (reader != null)
    	               reader.close();
    	           if (copy != null)
    	               copy.close();
    	       }
    	   }
    	 
    
    	    /**
    	     * findKeywordPostions
    	     * @param pdfData     通过IO流 PDF文件转化的byte数组
    	     * @param keyword     关键字
    	     * @return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y
    	     * @throws IOException
    	     */
    	    public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
    	        List<float[]> result = new ArrayList<>();
    	        List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData);
    
    
    	        for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) {
    	            List<float[]> charPositions = findPositions(keyword, pdfPageContentPosition);
    	            if (charPositions == null || charPositions.size() < 1) {
    	                continue;
    	            }
    	            result.addAll(charPositions);
    	        }
    	        return result;
    	    }
    
    
    	    private static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
    	        PdfReader reader = new PdfReader(pdfData);
    
    
    	        List<PdfPageContentPositions> result = new ArrayList<>();
    
    
    	        int pages = reader.getNumberOfPages();
    	        for (int pageNum = 1; pageNum <= pages; pageNum++) {
    	            float width = reader.getPageSize(pageNum).getWidth();
    	            float height = reader.getPageSize(pageNum).getHeight();
    
    
    	            PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height);
    
    
    	            //解析pdf,定位位置
    	            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener);
    	            PdfDictionary pageDic = reader.getPageN(pageNum);
    	            PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
    	            try {
    	                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);
    	            } catch (IOException e) {
    	                reader.close();
    	                throw e;
    	            }
    
    
    	            String content = pdfRenderListener.getContent();
    	            List<CharPosition> charPositions = pdfRenderListener.getcharPositions();
    
    
    	            List<float[]> positionsList = new ArrayList<>();
    	            for (CharPosition charPosition : charPositions) {
    	                float[] positions = new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()};
    	                positionsList.add(positions);
    	            }
    
    
    	            PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions();
    	            pdfPageContentPositions.setContent(content);
    	            pdfPageContentPositions.setPostions(positionsList);
    
    
    	            result.add(pdfPageContentPositions);
    	        }
    	        reader.close();
    	        return result;
    	    }
    
    
    	    private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {
    
    
    	        List<float[]> result = new ArrayList<>();
    
    
    	        String content = pdfPageContentPositions.getContent();
    	        List<float[]> charPositions = pdfPageContentPositions.getPositions();
    
    
    	        for (int pos = 0; pos < content.length(); ) {
    	            int positionIndex = content.indexOf(keyword, pos);
    	            if (positionIndex == -1) {
    	                break;
    	            }
    	            System.out.println("page is "+ positionIndex);
    	            float[] postions = charPositions.get(positionIndex);
    	            result.add(postions);
    	            pos = positionIndex + 1;
    	        }
    	        return result;
    	    }
    
    
    	    private static class PdfPageContentPositions {
    	        private String content;
    	        private List<float[]> positions;
    
    
    	        public String getContent() {
    	            return content;
    	        }
    
    
    	        public void setContent(String content) {
    	            this.content = content;
    	        }
    
    
    	        public List<float[]> getPositions() {
    	            return positions;
    	        }
    
    
    	        public void setPostions(List<float[]> positions) {
    	            this.positions = positions;
    	        }
    	    }
    
    
    
    	    private static class PdfRenderListener implements RenderListener {
    	        private int pageNum;
    	        private float pageWidth;
    	        private float pageHeight;
    	        private StringBuilder contentBuilder = new StringBuilder();
    	        private List<CharPosition> charPositions = new ArrayList<>();
    
    
    	        public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {
    	            this.pageNum = pageNum;
    	            this.pageWidth = pageWidth;
    	            this.pageHeight = pageHeight;
    	        }
    
    
    	        public void beginTextBlock() {
    	        }
    
    
    	        public void renderText(TextRenderInfo renderInfo) {
    	            List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
    	            for (TextRenderInfo textRenderInfo : characterRenderInfos) {
    	                String word = textRenderInfo.getText();
    	                if (word.length() > 1) {
    	                    word = word.substring(word.length() - 1, word.length());
    	                }
    	               com.itextpdf.awt.geom.Rectangle2D.Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();
    
    	                float x = (float)rectangle.getX();
    	                float y = (float)rectangle.getY();
    //	                float x = (float)rectangle.getCenterX();
    //	                float y = (float)rectangle.getCenterY();
    //	                double x = rectangle.getMinX();
    //	                double y = rectangle.getMaxY();
    
    
    
    
    	                //这两个是关键字在所在页面的XY轴的百分比
    	                float xPercent = Math.round(x / pageWidth * 10000) / 10000f;
    	                float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;
    
    
    //	                CharPosition charPosition = new CharPosition(pageNum, xPercent, yPercent);
    	                CharPosition charPosition = new CharPosition(pageNum, (float)x, (float)y);
    	                charPositions.add(charPosition);
    	                contentBuilder.append(word);
    	            }
    	        }
    
    
    	        public void endTextBlock() {
    	        }
    
    
    	        public void renderImage(ImageRenderInfo renderInfo) {
    	        }
    
    
    	        public String getContent() {
    	            return contentBuilder.toString();
    	        }
    
    
    	        public List<CharPosition> getcharPositions() {
    	            return charPositions;
    	        }
    	    }
    
    
    	    private static class CharPosition {
    	        private int pageNum = 0;
    	        private float x = 0;
    	        private float y = 0;
    
    
    	        public CharPosition(int pageNum, float x, float y) {
    	            this.pageNum = pageNum;
    	            this.x = x;
    	            this.y = y;
    	        }
    
    
    	        public int getPageNum() {
    	            return pageNum;
    	        }
    
    
    	        public float getX() {
    	            return x;
    	        }
    
    
    	        public float getY() {
    	            return y;
    	        }
    
    
    	        @Override
    	        public String toString() {
    	            return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]";
    	        }
    	    }
    	
    	
    
    }
    
    
    
  • 相关阅读:
    鸡尾酒之白兰地
    Hadoop面试总结(三)Hbase、Spark
    Hadoop面试总结(二)MySQL
    Hadoop面试总结(一)Linux命令、Scala
    View
    用户画像项目规划
    Apache kafka
    Resume
    蔡学镛
    【git】git常用操作
  • 原文地址:https://www.cnblogs.com/wanthune/p/13353874.html
Copyright © 2011-2022 走看看