zoukankan      html  css  js  c++  java
  • 获取pdf、doc/docx文本数据

    1、依赖关系

    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.12</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.17</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>3.17</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>3.17</version>
    </dependency>

    2、代码

    package com.lucene.util;
    
    import com.zxf.lucene.common.consts.FileSuffix;
    import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
    import org.apache.pdfbox.io.RandomAccessRead;
    import org.apache.pdfbox.pdfparser.PDFParser;
    import org.apache.pdfbox.pdmodel.PDDocument;
    import org.apache.pdfbox.text.PDFTextStripper;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.openxml4j.opc.OPCPackage;
    import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
    import org.slf4j.Logger;
    import org.slf4j.LoggerFactory;
    
    import java.io.File;
    import java.io.FileInputStream;
    
    public class TextUtil {
    
        private static final Logger logger = LoggerFactory.getLogger(TextUtil.class);
    
        private TextUtil(){}
        /**获取pdf、doc、docx文档的文本信息*/
        public static String getTextOfFile(String filepath){
            String text = "";
            File file = new File(filepath);
            if(!file.isFile()){
                return text;
            }
            String fileName = file.getName();
            String suffix = getSuffix(fileName);
            if (FileSuffix.PDF.equalsIgnoreCase(suffix)) {
                try{
                    RandomAccessRead randomAccessRead = new RandomAccessBufferedFileInputStream(new FileInputStream(file));
                    PDFParser pdfParser = new PDFParser(randomAccessRead);
                    pdfParser.parse();
                    try(PDDocument pdDocument = pdfParser.getPDDocument()){
                        PDFTextStripper pdfTextStripper = new PDFTextStripper();
                        text = pdfTextStripper.getText(pdDocument);
                    }
                }catch (Exception e){
                    logger.error("获取pdf文本信息出错",e);
                    return text;
                }
            } else if (FileSuffix.DOCX.equalsIgnoreCase(suffix)) {
                try(XWPFWordExtractor xwpfWordExtractor = new XWPFWordExtractor(OPCPackage.open(file))){
                    text = xwpfWordExtractor.getText();
                }catch(Exception e){
                    logger.error("获取word文档(.docx)文本信息出错",e);
                    return text;
                }
            } else if (FileSuffix.DOC.equalsIgnoreCase(suffix)) {
                try(WordExtractor wordExtractor = new WordExtractor(new FileInputStream(file))){
                    text = wordExtractor.getText();
                }catch (Exception e){
                    logger.error("获取word文档(.doc)文本信息出错",e);
                    return text;
                }
            }else{
                return text;
            }
            return text.trim().replaceAll("\r", "").replaceAll("\n", "").replaceAll("\t", "").replaceAll("\s", "");
        }
    
        /**获取文件后缀*/
        public static String getSuffix(String string){
            int one = 1;
            String douhao = ".";
            return string.substring(string.lastIndexOf(douhao)+one);
        }
    
        /**剔除路径中的不合法字符卷*/
        public static String clearIllegalCharacter(String fieldValue) {
            return fieldValue.replaceAll("[\/:\*\?"<>\\|]", "");
        }
    
    }
    人生没有彩排,每天都是现场直播!
  • 相关阅读:
    常用网站
    我的第一个 python 爬虫脚本
    在文件夹下所有文件中查找字符串(linux/windows)
    Python 列表 insert() 方法
    mysql 替换 tab 键 ( )
    访问权限的修饰符
    eclipse 快捷键
    位运算
    hadoop 环境搭建
    Hadoop 快速入门
  • 原文地址:https://www.cnblogs.com/northern-light/p/10498377.html
Copyright © 2011-2022 走看看