zoukankan      html  css  js  c++  java
  • java word文档 转 html文件

    一、简介
      一般word文件后缀有doc、docx两种。docx是office word 2007以及以后版本文档的扩展名;doc是office word 2003文档保存的扩展名。对于这两种格式的word转换成html需要使用不同的方法。
    对于docx格式的文档使用xdocreport进行转换。依赖如下:

    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.xdocreport.document</artifactId>
        <version>1.0.5</version>
    </dependency>
    <dependency>  
        <groupId>fr.opensagres.xdocreport</groupId>  
        <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>  
        <version>1.0.5</version>  
    </dependency>

    对于doc格式的文档使用poi进行转换。依赖如下:

    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.12</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>3.12</version>
    </dependency>

    二、示例
      代码示例如下:

      1 package com.test.word;
      2 
      3 import java.io.File;
      4 import java.io.FileInputStream;
      5 import java.io.FileNotFoundException;
      6 import java.io.FileOutputStream;
      7 import java.io.IOException;
      8 import java.io.InputStream;
      9 import java.io.OutputStream;
     10 
     11 import javax.xml.parsers.DocumentBuilderFactory;
     12 import javax.xml.parsers.ParserConfigurationException;
     13 import javax.xml.transform.OutputKeys;
     14 import javax.xml.transform.Transformer;
     15 import javax.xml.transform.TransformerException;
     16 import javax.xml.transform.TransformerFactory;
     17 import javax.xml.transform.dom.DOMSource;
     18 import javax.xml.transform.stream.StreamResult;
     19 
     20 import org.apache.poi.hwpf.HWPFDocument;
     21 import org.apache.poi.hwpf.converter.PicturesManager;
     22 import org.apache.poi.hwpf.converter.WordToHtmlConverter;
     23 import org.apache.poi.hwpf.usermodel.PictureType;
     24 import org.apache.poi.xwpf.converter.core.FileImageExtractor;
     25 import org.apache.poi.xwpf.converter.core.FileURIResolver;
     26 import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
     27 import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
     28 import org.apache.poi.xwpf.usermodel.XWPFDocument;
     29 import org.junit.Test;
     30 import org.w3c.dom.Document;
     31 
     32 /**
     33  * word 转换成html
     34  */
     35 public class WordToHtml {
     36     
     37     /**
     38      * 2007版本word转换成html
     39      * @throws IOException
     40      */
     41     @Test 
     42     public void Word2007ToHtml() throws IOException {
     43         String filepath = "C:/test/";
     44         String fileName = "滕王阁序2007.docx";
     45         String htmlName = "滕王阁序2007.html";
     46         final String file = filepath + fileName;
     47         File f = new File(file);  
     48         if (!f.exists()) {  
     49             System.out.println("Sorry File does not Exists!");  
     50         } else {  
     51             if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {  
     52                   
     53                 // 1) 加载word文档生成 XWPFDocument对象  
     54                 InputStream in = new FileInputStream(f);  
     55                 XWPFDocument document = new XWPFDocument(in);  
     56   
     57                 // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)  
     58                 File imageFolderFile = new File(filepath);  
     59                 XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));  
     60                 options.setExtractor(new FileImageExtractor(imageFolderFile));  
     61                 options.setIgnoreStylesIfUnused(false);  
     62                 options.setFragment(true);  
     63                   
     64                 // 3) 将 XWPFDocument转换成XHTML  
     65                 OutputStream out = new FileOutputStream(new File(filepath + htmlName));  
     66                 XHTMLConverter.getInstance().convert(document, out, options);  
     67                 
     68                 //也可以使用字符数组流获取解析的内容
     69 //                ByteArrayOutputStream baos = new ByteArrayOutputStream(); 
     70 //                XHTMLConverter.getInstance().convert(document, baos, options);  
     71 //                String content = baos.toString();
     72 //                System.out.println(content);
     73 //                 baos.close();
     74             } else {  
     75                 System.out.println("Enter only MS Office 2007+ files");  
     76             }  
     77         }  
     78     }  
     79     
     80     /**
     81      * /**
     82      * 2003版本word转换成html
     83      * @throws IOException
     84      * @throws TransformerException
     85      * @throws ParserConfigurationException
     86      */
     87     @Test 
     88     public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException {
     89         String filepath = "C:/test/";
     90         final String imagepath = "C:/test/image/";
     91         String fileName = "滕王阁序2003.doc";
     92         String htmlName = "滕王阁序2003.html";
     93         final String file = filepath + fileName;
     94         InputStream input = new FileInputStream(new File(file));
     95         HWPFDocument wordDocument = new HWPFDocument(input);
     96         WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
     97         //设置图片存放的位置
     98         wordToHtmlConverter.setPicturesManager(new PicturesManager() {
     99             public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
    100                 File imgPath = new File(imagepath);
    101                 if(!imgPath.exists()){//图片目录不存在则创建
    102                     imgPath.mkdirs();
    103                 }
    104                 File file = new File(imagepath + suggestedName);
    105                 try {
    106                     OutputStream os = new FileOutputStream(file);
    107                     os.write(content);
    108                     os.close();
    109                 } catch (FileNotFoundException e) {
    110                     e.printStackTrace();
    111                 } catch (IOException e) {
    112                     e.printStackTrace();
    113                 }
    114                 return imagepath + suggestedName;
    115             }
    116         });
    117         
    118         //解析word文档
    119         wordToHtmlConverter.processDocument(wordDocument);
    120         Document htmlDocument = wordToHtmlConverter.getDocument();
    121         
    122         File htmlFile = new File(filepath + htmlName);
    123         OutputStream outStream = new FileOutputStream(htmlFile);
    124         
    125         //也可以使用字符数组流获取解析的内容
    126 //        ByteArrayOutputStream baos = new ByteArrayOutputStream(); 
    127 //        OutputStream outStream = new BufferedOutputStream(baos);
    128 
    129         DOMSource domSource = new DOMSource(htmlDocument);
    130         StreamResult streamResult = new StreamResult(outStream);
    131 
    132         TransformerFactory factory = TransformerFactory.newInstance();
    133         Transformer serializer = factory.newTransformer();
    134         serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    135         serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    136         serializer.setOutputProperty(OutputKeys.METHOD, "html");
    137         
    138         serializer.transform(domSource, streamResult);
    139 
    140         //也可以使用字符数组流获取解析的内容
    141 //        String content = baos.toString();
    142 //        System.out.println(content);
    143 //        baos.close();
    144         outStream.close();
    145     }
    146 }

      运行后生成的文件结果如下:

      

       

  • 相关阅读:
    2-4 递增链表的插入 链表
    KMPnext数组自看
    Shortest Prefixes POJ
    Xor Sum HDU
    Immediate Decodability HDU
    Repository HDU
    "strcmp()" Anyone? UVA
    Remember the Word UVALive
    A Magic Lamp HDU
    Check Corners HDU
  • 原文地址:https://www.cnblogs.com/always-online/p/4800131.html
Copyright © 2011-2022 走看看