zoukankan      html  css  js  c++  java
  • Poi读取word(doc)文档的文本或图片

    package org.jimmy.studyproject.util;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;
    import java.util.regex.Pattern;
    
    import org.apache.poi.hwpf.HWPFDocument;
    import org.apache.poi.hwpf.extractor.WordExtractor;
    import org.apache.poi.hwpf.model.PicturesTable;
    import org.apache.poi.hwpf.usermodel.CharacterRun;
    import org.apache.poi.hwpf.usermodel.Picture;
    import org.apache.poi.hwpf.usermodel.Range;
    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import org.apache.poi.xwpf.usermodel.XWPFParagraph;
    
    import com.google.common.base.CharMatcher;
    import com.google.common.collect.Lists;
    
    /**
     * Helpers for reading Word documents with Apache POI: paragraph text from
     * {@code .doc} / {@code .docx} files and embedded pictures from {@code .doc} files.
     *
     * <p>All methods work on the hard-coded directories {@link #DIR_PATH} and
     * {@link #PICTURE_DIR_PATH}.
     */
    @SuppressWarnings("unused")
    public class Utils {

        /** Directory that is scanned for Word documents. */
        public static final String DIR_PATH = "D:/Resume/TopicSolutions/SourceFile/";
        /** Directory that extracted pictures are written to. */
        public static final String PICTURE_DIR_PATH = "D:/Resume/TopicSolutions/Picture/";

        /** Opening of the whitelist character class: ASCII letters, digits and CJK ideographs. */
        private static final String UNICODE_REG_PREFIX = "[a-zA-Z0-9\u4e00-\u9fa5";
        /** Field marker left in the text by embedded equations once whitespace is removed. */
        private static final String EQUATION_FIELD_MARKER = "EMBEDEquation.3";

        /**
         * Source of the whitelist regex. Holds only the opening of the character class
         * until the pattern has been built, afterwards the complete expression.
         */
        public static String unicodeRegStr = UNICODE_REG_PREFIX;
        /** Comma-separated punctuation characters that are accepted in addition to the prefix. */
        public static final String PUNCTUATION = "\u3001,\u03c1,\u3002,\uff08,\u03c8,\uff09,\u300a,\u300b,\uff0b,\uff0c,\uff0d,\uff0e,\u33d1,\u2014,\u00d7,\u2019,\uff1a,\u005b,\uff1c,\u005d,\uff1d,\u221e,\uff1e,\uff1f,\u0060,\u2264,\u0028,\u0029,\u002b,\u222b,\u002d,\u002e,\u002f,\u00b1,\u03b8,\u007b,\u043b,\u007c,\u003c,\u003d,\u007d,\u003e";
        /** Compiled form of {@link #unicodeRegStr}; matches exactly one accepted character. */
        public static Pattern unicodeReg = null;

        /**
         * Builds the whitelist pattern, reads the documents in {@link #DIR_PATH} and prints
         * their characters to standard output. Any failure is reported as a stack trace.
         *
         * @param args ignored
         */
        public static void main(String[] args) {
            try {
                unicodeRegStr = buildUnicodeRegStr();
                System.out.println(unicodeRegStr);
                unicodeReg = Pattern.compile(unicodeRegStr);
                // NOTE(review): the filtered result is replaced right away by the unfiltered
                // one, so only the unfiltered characters are printed - confirm which is wanted.
                List<String> contextList = readWordFile();
                contextList = readSourceWordFile();
                writeWordFile(contextList);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        /**
         * Prints every element of the list to standard output without separators.
         *
         * @param contextList the strings to print; {@code null} prints nothing
         */
        public static void writeWordFile(List<String> contextList) {
            if (contextList == null) {
                return;
            }
            for (String context : contextList) {
                System.out.print(context);
            }
        }

        /**
         * Walks every character of each {@code .doc} file in {@link #DIR_PATH} and writes the
         * picture attached to a character run, if any, to {@link #PICTURE_DIR_PATH}.
         *
         * <p>No text or style information is collected yet, so the result is always empty.
         *
         * @return an empty string
         * @throws Exception if a document cannot be opened or a picture cannot be written
         */
        public static String getWordAndStyle() throws Exception {
            String text = "";
            File[] files = new File(DIR_PATH).listFiles();
            if (files == null) {
                // Not a directory, or it could not be listed.
                return text;
            }
            for (File file : files) {
                // HWPFDocument only understands the binary .doc format.
                if (!file.isFile() || !hasExtension(file, ".doc")) {
                    continue;
                }
                try (FileInputStream in = new FileInputStream(file);
                        HWPFDocument doc = new HWPFDocument(in)) {
                    PicturesTable picturesTable = doc.getPicturesTable();
                    int length = doc.characterLength();
                    for (int i = 0; i < length; i++) {
                        CharacterRun run = new Range(i, i + 1, doc).getCharacterRun(0);
                        if (picturesTable.hasPicture(run)) {
                            readPicture(picturesTable, run);
                        }
                    }
                }
            }
            return text;
        }

        /**
         * Extracts the picture attached to the given character run and writes it to
         * {@link #PICTURE_DIR_PATH} under the file name suggested by POI.
         *
         * @param pTable pictures table of the document the run belongs to
         * @param cr character run that carries the picture
         * @return path of the written picture file
         * @throws Exception if the picture file cannot be written
         */
        public static String readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
            Picture pic = pTable.extractPicture(cr, false);
            String pictureFilePath = PICTURE_DIR_PATH + pic.suggestFullFileName();
            // Close the stream even when writing fails so the file handle is released.
            try (OutputStream out = new FileOutputStream(new File(pictureFilePath))) {
                pic.writeImageContent(out);
            }
            return pictureFilePath;
        }

        /**
         * Detail: reads the source files and drops garbled characters, i.e. every character
         * that is not matched by {@link #unicodeReg}.
         * Author: Lapis Lazuli (Dawn)
         * Date: 2020-04-22 17:25:17
         *
         * @return the accepted characters, one per list element, in document order
         * @throws Exception if a document cannot be read
         */
        public static List<String> readWordFile() throws Exception {
            return splitIntoCharacters(readParagraphs(), unicodePattern());
        }

        /**
         * Detail: reads the source files and keeps every character, garbled ones included.
         * Author: Lapis Lazuli (Dawn)
         * Date: 2020-04-22 17:24:51
         *
         * @return all characters, one per list element, in document order
         * @throws Exception if a document cannot be read
         */
        public static List<String> readSourceWordFile() throws Exception {
            return splitIntoCharacters(readParagraphs(), null);
        }

        /**
         * Builds the whitelist character class from the prefix and {@link #PUNCTUATION}.
         * ASCII punctuation is backslash-escaped so that characters such as the hyphen or the
         * square brackets are taken literally instead of forming ranges or nested classes.
         */
        private static String buildUnicodeRegStr() {
            StringBuilder regex = new StringBuilder(UNICODE_REG_PREFIX);
            for (String punctuation : PUNCTUATION.split(",")) {
                for (int i = 0; i < punctuation.length(); i++) {
                    char ch = punctuation.charAt(i);
                    if (ch < 128 && !Character.isLetterOrDigit(ch)) {
                        regex.append('\\');
                    }
                    regex.append(ch);
                }
            }
            return regex.append(']').toString();
        }

        /** Returns {@link #unicodeReg}, building it first when nobody has done so yet. */
        private static Pattern unicodePattern() {
            if (unicodeReg == null) {
                unicodeRegStr = buildUnicodeRegStr();
                unicodeReg = Pattern.compile(unicodeRegStr);
            }
            return unicodeReg;
        }

        /** Tells whether the file name ends with the given extension, ignoring case. */
        private static boolean hasExtension(File file, String extension) {
            String name = file.getName();
            int start = name.length() - extension.length();
            return start >= 0 && name.regionMatches(true, start, extension, 0, extension.length());
        }

        /** Removes all whitespace and the embedded-equation field marker from a paragraph. */
        private static String cleanParagraph(String paragraph) {
            // Literal replace: the dot in the marker must not act as a regex wildcard.
            return CharMatcher.whitespace().removeFrom(paragraph).replace(EQUATION_FIELD_MARKER, "");
        }

        /** Reads the cleaned paragraphs of every .doc and .docx file in {@link #DIR_PATH}. */
        private static List<String> readParagraphs() throws Exception {
            List<String> paragraphs = new ArrayList<>();
            File[] files = new File(DIR_PATH).listFiles();
            if (files == null) {
                // Not a directory, or it could not be listed.
                return paragraphs;
            }
            for (File file : files) {
                if (!file.isFile()) {
                    continue;
                }
                if (hasExtension(file, ".doc")) {
                    try (InputStream stream = new FileInputStream(file);
                            HWPFDocument document = new HWPFDocument(stream);
                            WordExtractor extractor = new WordExtractor(document)) {
                        for (String paragraph : extractor.getParagraphText()) {
                            paragraphs.add(cleanParagraph(paragraph));
                        }
                    }
                } else if (hasExtension(file, ".docx")) {
                    try (InputStream stream = new FileInputStream(file);
                            XWPFDocument document = new XWPFDocument(stream)) {
                        for (XWPFParagraph paragraph : document.getParagraphs()) {
                            paragraphs.add(cleanParagraph(paragraph.getParagraphText()));
                        }
                    }
                }
            }
            return paragraphs;
        }

        /**
         * Splits the paragraphs into single-character strings, keeping only the characters
         * matched by the filter; a {@code null} filter keeps everything.
         */
        private static List<String> splitIntoCharacters(List<String> paragraphs, Pattern filter) {
            List<String> characters = new ArrayList<>();
            for (String paragraph : paragraphs) {
                for (int i = 0; i < paragraph.length(); i++) {
                    String current = String.valueOf(paragraph.charAt(i));
                    if (filter == null || filter.matcher(current).matches()) {
                        characters.add(current);
                    }
                }
            }
            return characters;
        }

    }
  • 相关阅读:
    jdbc概述
    MongoDB(三):数据库操作、集合操作
    MongoDB(二):在Windows环境安装MongoDB
    MongoDB(一):NoSQL简介、MongoDB简介
    python基础(36):pymysql模块
    Web前端基础(19):jQuery基础(六)
    Web前端基础(18):jQuery基础(五)
    Web前端基础(17):jQuery基础(四)
    Web前端基础(16):jQuery基础(三)
    Web前端基础(15):jQuery基础(二)
  • 原文地址:https://www.cnblogs.com/JimmySeraph/p/12753631.html
Copyright © 2011-2022 走看看