zoukankan      html  css  js  c++  java
  • POI实现Word转HTML文件

    package cn.wgd.util;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileOutputStream;
    import java.io.IOException;
    
    import javax.xml.parsers.ParserConfigurationException;
    import javax.xml.transform.OutputKeys;
    import javax.xml.transform.Transformer;
    import javax.xml.transform.TransformerException;
    import javax.xml.transform.TransformerFactory;
    import javax.xml.transform.dom.DOMSource;
    import javax.xml.transform.stream.StreamResult;
    
    import org.apache.poi.hwpf.HWPFDocumentCore;
    import org.apache.poi.hwpf.converter.AbstractWordUtils;
    import org.apache.poi.hwpf.converter.WordToHtmlConverter;
    import org.apache.poi.util.XMLHelper;
    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import org.w3c.dom.Document;
    import org.xml.sax.SAXException;
    
    import fr.opensagres.poi.xwpf.converter.core.IXWPFConverter;
    import fr.opensagres.poi.xwpf.converter.core.ImageManager;
    import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
    import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
    
    /**
     * Converts Word documents to HTML files so that attachments can be previewed.
     *
     * <p>Files whose name ends in {@code .docx} (any letter case) are converted with the
     * XDocReport XHTML converter; every other file is passed to POI's
     * {@link WordToHtmlConverter}.
     *
     * <p>Background on image handling: https://www.cnblogs.com/feiruo/p/5924514.html
     *
     * <p>Besides the POI jars (POI 3.17) this example needs:
     * fr.opensagres.poi.xwpf.converter.core-2.0.1.jar,
     * fr.opensagres.poi.xwpf.converter.xhtml-2.0.1.jar,
     * fr.opensagres.xdocreport.core-2.0.1.jar,
     * ooxml-schemas-1.3.jar, etc.
     *
     * <p>Note: this is a deliberately simple implementation. Documents that need richer
     * style handling require additional, custom work.
     *
     * @author Kevin 2018-3-14
     */
    public class ConvertWord2HtmlUtil {

        /** Exception message used when a mandatory path argument is {@code null}. */
        private static final String NULL_PATH_MESSAGE = "路径不能为空!";

        /** File extension that selects the XWPF (docx) conversion path. */
        private static final String DOCX_EXTENSION = ".docx";

        /** Sub-directory name handed to the {@link ImageManager} for extracted images. */
        private static final String IMAGE_SUB_DIR = "image";

        /**
         * Sample entry point that converts one hard-coded document.
         *
         * @param args ignored
         */
        public static void main(String[] args)
                throws IOException, ParserConfigurationException, TransformerException, SAXException {
            // Backslashes must be escaped: an unescaped "\t" inside a string literal is a TAB.
            String path = "D:\\testfile2html\\test.docx";
            String descPath = "D:\\testfile2html\\test.html";
            String imagePath = "D:\\testfile2html";
            word2007ToHtml(path, descPath, imagePath);
        }

        /**
         * Converts a {@code .doc} file to HTML. Modelled on
         * {@code org.apache.poi.hwpf.converter.WordToHtmlConverter.main()}.
         *
         * @param path     path of the source document
         * @param descPath path of the HTML file to write
         * @throws NullPointerException         if {@code path} or {@code descPath} is {@code null}
         * @throws IOException                  if the source cannot be read or the target cannot be written
         * @throws ParserConfigurationException if no DOM document builder can be created
         * @throws TransformerException         if serializing the generated DOM fails
         */
        public static void word95T2007ToHtml(String path, String descPath)
                throws IOException, ParserConfigurationException, TransformerException {
            if (path == null || descPath == null) {
                throw new NullPointerException(NULL_PATH_MESSAGE);
            }

            System.out.println("Converting " + path);
            System.out.println("Saving output to " + descPath);

            Document htmlDom = process(new File(path));

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            // TODO set encoding from a command argument
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");

            // Write through a stream opened and closed here, so the file handle is released
            // deterministically instead of relying on how the transformer treats a File result.
            try (FileOutputStream out = new FileOutputStream(descPath)) {
                serializer.transform(new DOMSource(htmlDom), new StreamResult(out));
            }
        }

        /**
         * Loads a Word document and converts it into an HTML DOM tree. Taken from
         * {@code org.apache.poi.hwpf.converter.WordToHtmlConverter}.
         *
         * @param docFile the Word file to load
         * @return the DOM document produced by the converter
         * @throws IOException                  if the file cannot be loaded
         * @throws ParserConfigurationException if no DOM document builder can be created
         */
        static Document process(File docFile) throws IOException, ParserConfigurationException {
            HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc(docFile);
            Document target = XMLHelper.getDocumentBuilderFactory().newDocumentBuilder().newDocument();
            WordToHtmlConverter converter = new WordToHtmlConverter(target);
            converter.processDocument(wordDocument);
            return converter.getDocument();
        }

        /**
         * Converts a Word file to HTML, choosing the converter from the file extension.
         *
         * <p>If the source file does not exist a message is printed and nothing is written.
         *
         * @param path      path of the source file (doc or docx)
         * @param descPath  path of the HTML file to write
         * @param imagePath base directory for extracted images; when {@code null} the directory
         *                  containing {@code descPath} is used. Only used for docx sources.
         * @throws NullPointerException         if {@code path} or {@code descPath} is {@code null}
         * @throws IOException                  if reading the source or writing the target fails
         * @throws ParserConfigurationException propagated from the doc conversion
         * @throws TransformerException         propagated from the doc conversion
         * @throws SAXException                 declared for callers; kept for interface compatibility
         */
        public static void word2007ToHtml(String path, String descPath, String imagePath)
                throws IOException, ParserConfigurationException, TransformerException, SAXException {
            if (path == null || descPath == null) {
                throw new NullPointerException(NULL_PATH_MESSAGE);
            }

            File sourceFile = new File(path);
            if (!sourceFile.exists()) {
                System.out.println("用户文件不存在!");
                return;
            }

            if (!hasDocxExtension(path)) {
                word95T2007ToHtml(path, descPath);
                return;
            }

            // The image base directory should be the directory that holds the HTML file,
            // otherwise the image links in the generated page cannot be resolved.
            File imageBaseDir = (imagePath != null)
                    ? new File(imagePath)
                    : new File(descPath).getAbsoluteFile().getParentFile();

            // All three resources are closed in reverse order, even when conversion fails.
            try (FileInputStream in = new FileInputStream(sourceFile);
                    XWPFDocument document = new XWPFDocument(in);
                    FileOutputStream out = new FileOutputStream(descPath)) {
                // HTML converter
                IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();
                // HTML conversion options
                XHTMLOptions options = XHTMLOptions.create();
                options.setImageManager(new ImageManager(imageBaseDir, IMAGE_SUB_DIR));

                converter.convert(document, out, options);
            }
        }

        /** Returns whether {@code path} ends with {@code .docx}, ignoring letter case. */
        private static boolean hasDocxExtension(String path) {
            int extensionStart = path.length() - DOCX_EXTENSION.length();
            // regionMatches returns false for a negative offset, so short names are safe.
            return path.regionMatches(true, extensionStart, DOCX_EXTENSION, 0, DOCX_EXTENSION.length());
        }
    }
    
  • 相关阅读:
    从针对接口编程到依赖注入
    DataRow 数组转化成DataTable
    Math 类的方法概要
    .net控件
    字符串反转
    DataTable
    Enabled设置为False时,前景色和背景色也不改变的TextBox 并居中
    C# 四舍五入 (解决四舍六入五成双的问题)
    查询最后一条数据
    C# toString()
  • 原文地址:https://www.cnblogs.com/Kevin-1992/p/12608382.html
Copyright © 2011-2022 走看看