zoukankan html css js c++ java
Poi读取word(doc)文档的文本或图片

package org.jimmy.studyproject.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;

import com.google.common.base.CharMatcher;
import com.google.common.collect.Lists;

/**
 * Helpers that read the text and the embedded pictures of the Word documents
 * ({@code .doc} via HWPF, {@code .docx} via XWPF) found directly inside
 * {@link #DIR_PATH}.
 *
 * <p>Text is returned one character per list element. {@link #readWordFile()}
 * keeps only the characters accepted by {@link #unicodeReg};
 * {@link #readSourceWordFile()} keeps every character.
 */
public class Utils {

    /** Directory scanned for the Word documents to read. */
    public static final String DIR_PATH = "D:/Resume/TopicSolutions/SourceFile/";
    /** Directory that extracted pictures are written to. */
    public static final String PICTURE_DIR_PATH = "D:/Resume/TopicSolutions/Picture/";
    /**
     * Comma-separated list of the punctuation and symbol characters that
     * {@link #unicodeReg} accepts in addition to ASCII letters, ASCII digits and
     * the CJK range of the character class prefix.
     */
    public static final String PUNCTUATION = "\u3001,\u03c1,\u3002,\uff08,\u03c8,\uff09,\u300a,\u300b,\uff0b,\uff0c,\uff0d,\uff0e,\u33d1,\u2014,\u00d7,\u2019,\uff1a,\u005b,\uff1c,\u005d,\uff1d,\u221e,\uff1e,\uff1f,\u0060,\u2264,\u0028,\u0029,\u002b,\u222b,\u002d,\u002e,\u002f,\u00b1,\u03b8,\u007b,\u043b,\u007c,\u003c,\u003d,\u007d,\u003e";

    /** Characters that have a special meaning inside a regex character class. */
    private static final String CHAR_CLASS_META = "\\[]^-&";
    /** Field-code residue left in the extracted text by embedded equation objects. */
    private static final String EQUATION_FIELD_MARKER = "EMBEDEquation.3";

    /**
     * Source of {@link #unicodeReg}: a single character class. It is built when the
     * class is loaded, so it is complete without {@link #main(String[])} having run.
     */
    public static String unicodeRegStr = buildUnicodeRegStr();
    /** Matches exactly one character that {@link #readWordFile()} keeps. */
    public static Pattern unicodeReg = Pattern.compile(unicodeRegStr);

    /**
     * Prints the filter expression, then prints every character of every document
     * in {@link #DIR_PATH} without filtering. Use {@link #readWordFile()} instead of
     * {@link #readSourceWordFile()} to print the filtered text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            System.out.println(unicodeRegStr);
            List<String> contextList = readSourceWordFile();
            writeWordFile(contextList);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the given strings to standard output, back to back, with no separator.
     *
     * @param contextList strings to print; {@code null} prints nothing
     */
    public static void writeWordFile(List<String> contextList) {
        if (contextList == null) {
            return;
        }
        for (String context : contextList) {
            System.out.print(context);
        }
    }

    /**
     * Walks every {@code .doc} file in {@link #DIR_PATH} one character at a time
     * and saves each picture that is attached to a character run, see
     * {@link #readPicture(PicturesTable, CharacterRun)}.
     *
     * @return always the empty string; no text is collected yet
     * @throws Exception if a document cannot be read or a picture cannot be written
     */
    public static String getWordAndStyle() throws Exception {
        String text = "";
        File[] fileArr = new File(DIR_PATH).listFiles();
        if (fileArr == null) {
            // Not a directory, or the directory could not be listed.
            return text;
        }
        for (File file : fileArr) {
            // HWPF only understands the binary .doc format.
            if (!file.getAbsolutePath().endsWith(".doc")) {
                continue;
            }
            try (FileInputStream in = new FileInputStream(file);
                    HWPFDocument doc = new HWPFDocument(in)) {
                PicturesTable picturesTable = doc.getPicturesTable();
                int length = doc.characterLength();
                for (int i = 0; i < length; i++) {
                    // One-character range, so the run looked up belongs to that character.
                    CharacterRun cr = new Range(i, i + 1, doc).getCharacterRun(0);
                    if (picturesTable.hasPicture(cr)) {
                        readPicture(picturesTable, cr);
                    }
                }
            }
        }
        return text;
    }

    /**
     * Extracts the picture attached to a character run and writes its image
     * content to a file in {@link #PICTURE_DIR_PATH}, named as the picture suggests.
     *
     * @param pTable pictures table of the document that owns {@code cr}
     * @param cr     character run that carries the picture
     * @return path of the file that was written
     * @throws Exception if the picture cannot be extracted or the file cannot be written
     */
    public static String readPicture(PicturesTable pTable, CharacterRun cr) throws Exception {
        Picture pic = pTable.extractPicture(cr, false);
        String pictureFilePath = PICTURE_DIR_PATH + pic.suggestFullFileName();
        try (OutputStream out = new FileOutputStream(new File(pictureFilePath))) {
            pic.writeImageContent(out);
        }
        return pictureFilePath;
    }

    /**
     * Detail: reads the source files and filters out garbled characters, keeping
     * only the characters matched by {@link #unicodeReg}.
     * Author: Lapis Lazuli (Dawn)
     * Date: 2020-04-22 17:25:17
     *
     * @return the kept characters in document order, one character per element
     * @throws Exception if a document cannot be read
     */
    public static List<String> readWordFile() throws Exception {
        return splitIntoCharacters(readParagraphs(), unicodeReg);
    }

    /**
     * Detail: reads the source files without filtering, so garbled characters are
     * kept.
     * Author: Lapis Lazuli (Dawn)
     * Date: 2020-04-22 17:24:51
     *
     * @return every character in document order, one character per element
     * @throws Exception if a document cannot be read
     */
    public static List<String> readSourceWordFile() throws Exception {
        return splitIntoCharacters(readParagraphs(), null);
    }

    /** Builds the character-class expression, escaping the punctuation that is special inside a class. */
    private static String buildUnicodeRegStr() {
        StringBuilder regex = new StringBuilder("[a-zA-Z0-9\u4e00-\u9fa5");
        for (String punctuation : PUNCTUATION.split(",")) {
            for (int i = 0; i < punctuation.length(); i++) {
                char symbol = punctuation.charAt(i);
                // Unescaped, '[' and ']' open or close a class and '-' forms a range.
                if (CHAR_CLASS_META.indexOf(symbol) >= 0) {
                    regex.append('\\');
                }
                regex.append(symbol);
            }
        }
        return regex.append(']').toString();
    }

    /** Reads the cleaned paragraph texts of every .doc and .docx file in DIR_PATH; other entries are skipped. */
    private static List<String> readParagraphs() throws Exception {
        List<String> paragraphs = Lists.newArrayList();
        File[] fileArr = new File(DIR_PATH).listFiles();
        if (fileArr == null) {
            // Not a directory, or the directory could not be listed.
            return paragraphs;
        }
        for (File file : fileArr) {
            String path = file.getAbsolutePath();
            if (path.endsWith(".doc")) {
                try (InputStream stream = new FileInputStream(file);
                        HWPFDocument document = new HWPFDocument(stream);
                        WordExtractor extractor = new WordExtractor(document)) {
                    for (String paragraph : extractor.getParagraphText()) {
                        paragraphs.add(cleanParagraph(paragraph));
                    }
                }
            } else if (path.endsWith(".docx")) {
                try (InputStream stream = new FileInputStream(file);
                        XWPFDocument document = new XWPFDocument(stream)) {
                    for (XWPFParagraph paragraph : document.getParagraphs()) {
                        paragraphs.add(cleanParagraph(paragraph.getParagraphText()));
                    }
                }
            }
        }
        return paragraphs;
    }

    /** Removes all whitespace, then the literal equation field marker, from one paragraph. */
    private static String cleanParagraph(String paragraph) {
        return CharMatcher.whitespace().removeFrom(paragraph).replace(EQUATION_FIELD_MARKER, "");
    }

    /** Splits the paragraphs into one-character strings, keeping those the filter matches (all when the filter is null). */
    private static List<String> splitIntoCharacters(List<String> paragraphs, Pattern filter) {
        List<String> characters = new ArrayList<String>();
        for (String paragraph : paragraphs) {
            for (int i = 0; i < paragraph.length(); i++) {
                String current = String.valueOf(paragraph.charAt(i));
                if (filter == null || filter.matcher(current).matches()) {
                    characters.add(current);
                }
            }
        }
        return characters;
    }

}
查看全文
相关阅读:
操作datetable 里面查出来的某个字段
 C# 字符串去重还有去除最后一位逗号。
C# .net 调用ERP接口
 视图下拉列表接收控制器传来的值，并选中下拉列表中该值相对应的选项（新手笔记，请各位大神指教）
MVC5控制器传值的三种方式（ViewData，ViewBag，TempData），刚刚学习MVC5的新手，希望各位大神多多指教
 c++模板之SFINAE
c++头文件包含问题
 成员函数指针有多态的效果吗?
emacs基本操作
 在c++中用function与bind实现委托
原文地址：https://www.cnblogs.com/JimmySeraph/p/12753631.html