zoukankan      html  css  js  c++  java
  • Poi读取word(doc)文档的文本或图片

    package org.jimmy.studyproject.util;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.regex.Pattern;
    
    import org.apache.poi.hwpf.HWPFDocument;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.hwpf.model.PicturesTable;
    import org.apache.poi.hwpf.usermodel.CharacterRun;
    import org.apache.poi.hwpf.usermodel.Picture;
    import org.apache.poi.hwpf.usermodel.Range;
    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import org.apache.poi.xwpf.usermodel.XWPFParagraph;
    
    import com.google.common.base.CharMatcher;
    import com.google.common.collect.Lists;
    
    /**
     * Utilities for reading text and pictures out of Word documents with Apache POI.
     *
     * <p>All methods work on the files found directly inside {@link #DIR_PATH}; extracted
     * pictures are written to {@link #PICTURE_DIR_PATH}.
     */
    public class Utils {

        public static final String DIR_PATH = "D:/Resume/TopicSolutions/SourceFile/";
        public static final String PICTURE_DIR_PATH = "D:/Resume/TopicSolutions/Picture/";
        /** Comma-separated list of the punctuation characters that the filter keeps. */
        public static final String PUNCTUATION = "\u3001,\u03c1,\u3002,\uff08,\u03c8,\uff09,\u300a,\u300b,\uff0b,\uff0c,\uff0d,\uff0e,\u33d1,\u2014,\u00d7,\u2019,\uff1a,\u005b,\uff1c,\u005d,\uff1d,\u221e,\uff1e,\uff1f,\u0060,\u2264,\u0028,\u0029,\u002b,\u222b,\u002d,\u002e,\u002f,\u00b1,\u03b8,\u007b,\u043b,\u007c,\u003c,\u003d,\u007d,\u003e";
        /**
         * Source of {@link #unicodeReg}: a single character class accepting ASCII letters and
         * digits, the CJK range used below, and every entry of {@link #PUNCTUATION}.
         */
        public static String unicodeRegStr = buildUnicodeRegStr();
        /**
         * Compiled form of {@link #unicodeRegStr}. Built when the class is initialised, so
         * {@link #readWordFile()} can be called without running {@link #main(String[])} first.
         */
        public static Pattern unicodeReg = Pattern.compile(unicodeRegStr);

        /**
         * Prints the filter expression, reads the source documents and prints the result.
         *
         * @param args unused
         */
        public static void main(String[] args){
            try {
                System.out.println(unicodeRegStr);
                // NOTE(review): the filtered result is immediately replaced by the unfiltered
                // one, exactly as in the original code - confirm which of the two is wanted.
                List<String> contextList = readWordFile();
                contextList = readSourceWordFile();
                writeWordFile(contextList);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Prints every entry of the list to standard output, without separators.
         *
         * @param contextList entries to print; {@code null} prints nothing
         */
        public static void writeWordFile(List<String> contextList){
            if(contextList == null){
                return;
            }
            for(String context : contextList){
                System.out.print(context);
            }
        }

        /**
         * Walks every character of every source file and saves each picture it finds
         * through {@link #readPicture(PicturesTable, CharacterRun)}.
         *
         * <p>NOTE(review): every file is opened as an HWPF document regardless of its
         * extension; presumably only .doc files are expected here - confirm.
         *
         * @return always the empty string; no text is collected yet
         * @throws Exception if a file cannot be opened or parsed, or a picture cannot be written
         */
        public static String getWordAndStyle() throws Exception{
            String text = "";
            for(File file : listSourceFiles()){
                // try-with-resources: the stream and the document used to be left open.
                try (FileInputStream in = new FileInputStream(file.getAbsolutePath());
                        HWPFDocument doc = new HWPFDocument(in)) {
                    PicturesTable picturesTable = doc.getPicturesTable();
                    int length = doc.characterLength();
                    for (int i = 0; i < length; i++){
                        // One-character range, so the character run belongs to character i.
                        CharacterRun cr = new Range(i, i + 1, doc).getCharacterRun(0);
                        if(picturesTable.hasPicture(cr)){
                            readPicture(picturesTable, cr);
                        }
                    }
                }
            }
            return text;
        }

        /**
         * Extracts the picture attached to a character run and writes it into
         * {@link #PICTURE_DIR_PATH} under the file name suggested by POI.
         *
         * <p>NOTE(review): the extracted picture is not null-checked; callers are presumably
         * expected to test {@code pTable.hasPicture(cr)} first, as
         * {@link #getWordAndStyle()} does.
         *
         * @param pTable pictures table of the document that owns {@code cr}
         * @param cr character run carrying the picture
         * @return path of the written picture file
         * @throws Exception if the picture file cannot be created or written
         */
        public static String readPicture(PicturesTable pTable, CharacterRun cr) throws Exception{
            Picture pic = pTable.extractPicture(cr, false);
            String pictureFilePath = PICTURE_DIR_PATH + pic.suggestFullFileName();
            // Close the stream even when writing fails; it used to be leaked.
            try (OutputStream out = new FileOutputStream(new File(pictureFilePath))) {
                pic.writeImageContent(out);
            }
            return pictureFilePath;
        }

        /**
         * Detail: reads the source files and filters out garbled characters.
         * Author: ラピスラズリ(Dawn)
         * Date: 2020-04-22 17:25:17
         *
         * @return one single-character string per character accepted by {@link #unicodeReg},
         *         in document order
         * @throws Exception if a source file cannot be opened or parsed
         */
        public static List<String> readWordFile() throws Exception {
            List<String> contextIndividualList = new ArrayList<String>();
            for(String paragraph : readCleanedParagraphs()){
                for(int i = 0; i < paragraph.length(); i++){
                    String currentWord = String.valueOf(paragraph.charAt(i));
                    if(unicodeReg.matcher(currentWord).matches()){
                        contextIndividualList.add(currentWord);
                    }
                }
            }
            return contextIndividualList;
        }

        /**
         * Detail: reads the source files without filtering, so garbled characters remain.
         * Author: ラピスラズリ(Dawn)
         * Date: 2020-04-22 17:24:51
         *
         * @return one single-character string per character of the cleaned paragraphs,
         *         in document order
         * @throws Exception if a source file cannot be opened or parsed
         */
        public static List<String> readSourceWordFile() throws Exception {
            List<String> contextIndividualList = new ArrayList<String>();
            for(String paragraph : readCleanedParagraphs()){
                for(int i = 0; i < paragraph.length(); i++){
                    contextIndividualList.add(String.valueOf(paragraph.charAt(i)));
                }
            }
            return contextIndividualList;
        }

        /** Builds the character-class source, escaping punctuation that is regex syntax. */
        private static String buildUnicodeRegStr() {
            StringBuilder charClass = new StringBuilder("[a-zA-Z0-9\u4e00-\u9fa5");
            for(String punctuation : PUNCTUATION.split(",")){
                for(int i = 0; i < punctuation.length(); i++){
                    char ch = punctuation.charAt(i);
                    boolean asciiLetterOrDigit = (ch >= 'a' && ch <= 'z')
                            || (ch >= 'A' && ch <= 'Z')
                            || (ch >= '0' && ch <= '9');
                    // Unescaped, '[' opens a nested class and '-' forms a range; the
                    // integral sign followed by '-' and '.' even made compilation fail.
                    // A backslash before a non-alphabetic ASCII character is always literal.
                    if(ch < 0x80 && !asciiLetterOrDigit){
                        charClass.append('\\');
                    }
                    charClass.append(ch);
                }
            }
            return charClass.append(']').toString();
        }

        /** Lists the regular files directly inside DIR_PATH; empty when it is not a readable directory. */
        private static List<File> listSourceFiles() {
            List<File> files = new ArrayList<File>();
            // listFiles() yields null when the path is not a directory or cannot be read.
            File[] entries = new File(DIR_PATH).listFiles();
            if(entries != null){
                for(File entry : entries){
                    if(entry.isFile()){
                        files.add(entry);
                    }
                }
            }
            return files;
        }

        /** Reads every paragraph of the .doc and .docx source files, already passed through cleanParagraph. */
        private static List<String> readCleanedParagraphs() throws Exception {
            List<String> paragraphs = new ArrayList<String>();
            for(File file : listSourceFiles()){
                String path = file.getAbsolutePath();
                try (InputStream stream = new FileInputStream(file)) {
                    if (path.endsWith(".doc")) {
                        try (HWPFDocument document = new HWPFDocument(stream);
                                WordExtractor extractor = new WordExtractor(document)) {
                            for(String paragraph : extractor.getParagraphText()){
                                paragraphs.add(cleanParagraph(paragraph));
                            }
                        }
                    } else if (path.endsWith(".docx")) {
                        try (XWPFDocument document = new XWPFDocument(stream)) {
                            for(XWPFParagraph paragraph : document.getParagraphs()){
                                paragraphs.add(cleanParagraph(paragraph.getParagraphText()));
                            }
                        }
                    }
                }
            }
            return paragraphs;
        }

        /** Removes all whitespace, then the equation placeholder text left in the paragraph. */
        private static String cleanParagraph(String paragraph) {
            // Literal replace: with replaceAll the '.' matched any character.
            return CharMatcher.whitespace().removeFrom(paragraph).replace("EMBEDEquation.3", "");
        }

    }
  • 相关阅读:
    高手详解:sscanf函数的高级用法
    堆排序——BuildHeap和Heapify函数的实现
    递归与动态规划求解最长公共子序列
    分享:crpcut 1.8.4 发布,C++ 的单元测试框架
    团队展示 京拍档 电商运营服务、电子商务服务外包 首家京东代运营电子商务服务平台
    Linux中link,unlink,close,fclose详解
    常用排序算法的c++实现(冒泡,选择,插入,堆,shell,快速,归并 )与sort()对比 coder_xia的专栏 博客频道 CSDN.NET
    CAJ文件转PDF文件方法
    递归与动态规划求解最长公共子序列
    NLP Job 专注自然语言处理&机器学习等领域的求职招聘 | 关注自然语言处理|机器学习|数据挖掘|搜索引擎|计算广告|推荐算法等相关领域的工作机会
  • 原文地址:https://www.cnblogs.com/JimmySeraph/p/12753631.html
Copyright © 2011-2022 走看看