1 package test; 2 3 import java.io.FileInputStream; 4 import java.io.IOException; 5 import java.util.ArrayList; 6 import java.util.List; 7 import java.util.regex.Pattern; 8 9 import org.apache.pdfbox.cos.COSDocument; 10 import org.apache.pdfbox.pdfparser.PDFParser; 11 import org.apache.pdfbox.util.PDFTextStripper; 12 13 import com.itextpdf.text.pdf.PdfReader; 14 import com.itextpdf.text.pdf.parser.PdfTextExtractor; 15 16 public class UploadUtils { 17 18 private final static Pattern pattern = Pattern.compile("\d+"); 19 private final static int stateParaOverFlag = 800; 20 private final static int thankParaOverFlag = 800; 21 22 /** 23 * 读取pdf参考文献内容 24 * 25 * @param s 26 * @return 27 */ 28 public String readPdf(String filePath) { 29 StringBuilder buffer = new StringBuilder(); 30 FileInputStream fis = null; 31 PdfReader pdfReader = null; 32 COSDocument cosDocument = null; 33 String[] paragraphs = null; 34 PDFParser p; 35 boolean addBool = true; 36 boolean judgeState = false; 37 boolean judgeThank = false; 38 StringBuilder tempSb = new StringBuilder(); 39 try { 40 fis = new FileInputStream(filePath); 41 p = new PDFParser(fis); 42 p.parse(); 43 cosDocument = p.getDocument(); 44 // 加密文档判断 45 if (cosDocument.isEncrypted()) { 46 StringBuilder tempContent = new StringBuilder(); 47 pdfReader = new PdfReader(filePath); 48 int i = pdfReader.getNumberOfPages(); 49 for (int j = 1; j <= i; j++) { 50 tempContent.append(PdfTextExtractor.getTextFromPage(pdfReader, j)); 51 } 52 paragraphs = tempContent.toString().split(" "); 53 } else { 54 PDFTextStripper ts = new PDFTextStripper(); 55 paragraphs = ts.getText(p.getPDDocument()).split(" "); 56 } 57 boolean mark = false; 58 List<Integer> errornum = new ArrayList<Integer>(); 59 int flag = 0; 60 int endRange = paragraphs.length * 70 / 100; 61 int rangeFlag = 0; 62 for (String lineContent : paragraphs) { 63 if (judgeState) { 64 tempSb.append(lineContent); 65 if (tempSb.length() >= stateParaOverFlag) { 66 judgeState = false; 67 addBool = 
true; 68 tempSb.delete(0, tempSb.length() - 1); 69 } 70 } 71 if (judgeThank) { 72 tempSb.append(lineContent); 73 if (tempSb.length() >= thankParaOverFlag) { 74 judgeThank = false; 75 addBool = true; 76 tempSb.delete(0, tempSb.length() - 1); 77 } 78 } 79 if (addBool) { 80 buffer.append(lineContent); 81 } 82 if (mark && rangeFlag >= endRange) { 83 if (lineContent.length() < 5) { 84 errornum.add(++flag); 85 rangeFlag++; 86 continue; 87 } 88 if (pattern.matcher(lineContent.substring(0, 5)).find()) { 89 if (flag != 0) { 90 flag = 0; 91 errornum.clear(); 92 } 93 } else { 94 errornum.add(++flag); 95 } 96 if (errornum.size() > 2) { 97 mark = false; 98 } 99 } 100 rangeFlag++; 101 } 102 } catch (Exception e) { 103 e.printStackTrace(); 104 } finally { 105 if (fis != null) { 106 try { 107 fis.close(); 108 } catch (IOException e) { 109 e.printStackTrace(); 110 } finally { 111 fis = null; 112 } 113 } 114 if (pdfReader != null) { 115 pdfReader.close(); 116 } 117 if (cosDocument != null) { 118 try { 119 cosDocument.close(); 120 } catch (IOException e) { 121 e.printStackTrace(); 122 } finally { 123 cosDocument = null; 124 } 125 } 126 } 127 return buffer.toString(); 128 } 129 130 public static boolean isBlank(CharSequence cs) { 131 int strLen; 132 if (cs == null || (strLen = cs.length()) == 0) { 133 return true; 134 } 135 for (int i = 0; i < strLen; i++) { 136 if (Character.isWhitespace(cs.charAt(i)) == false) { 137 return false; 138 } 139 } 140 return true; 141 } 142 143 public static void main(String[] args) { 144 // System.err.println(new UploadUtils() 145 // .readPdf("/opt/fileCache/2014/125/13/shuangping_D7037870CF4FC5C421A3E5359DCF8BBE.pdf")); 146 System.err.println(new UploadUtils().readPdf("E:\MyWork\guyezhai\pdf提取\路径依赖视角下高校新专业建设的策略创新(1).pdf")); 147 148 } 149 150 }
其中用到的 jar 包:
bcpkix-jdk15on-1.47.jar
bcprov-jdk15on-1.49.jar
commons-logging-1.1.2.jar
fontbox-1.8.2.jar
icu4j-4.0.1.jar
itextpdf-5.4.3.jar
jempbox-1.8.2.jar
pdfbox-1.8.2.jar