zoukankan      html  css  js  c++  java
  • word转html 和pdf

    今天有个新的需求,就是要把word进行预览,为了实现打印,需要转成pdf或html,在网上找了一些方法,这里做个记录

    首先是转html,看起来挺简单的
     
    首先是两个maven包
     
    <!-- java word文档 转 html文件 -->
    <dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.xdocreport.document</artifactId>
    <version>1.0.5</version>
    </dependency>
    <dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
    <version>1.0.5</version>
    </dependency>
     
     
    <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.12</version>
    </dependency>
    <dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.12</version>
    </dependency>
     
    然后就是转换demo
    package b2b.cn.util;
     
     
     
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.OutputStream;
     
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.parsers.ParserConfigurationException;
    import javax.xml.transform.OutputKeys;
    import javax.xml.transform.Transformer;
    import javax.xml.transform.TransformerException;
    import javax.xml.transform.TransformerFactory;
    import javax.xml.transform.dom.DOMSource;
    import javax.xml.transform.stream.StreamResult;
     
    import org.apache.poi.hwpf.HWPFDocument;
    import org.apache.poi.hwpf.converter.PicturesManager;
    import org.apache.poi.hwpf.converter.WordToHtmlConverter;
    import org.apache.poi.hwpf.usermodel.PictureType;
    import org.apache.poi.xwpf.converter.core.FileImageExtractor;
    import org.apache.poi.xwpf.converter.core.FileURIResolver;
    import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
    import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
    import org.apache.poi.xwpf.usermodel.XWPFDocument;
    import org.junit.Test;
    import org.w3c.dom.Document;
     
    /**
    * word 转换成html
    */
    public class GoHTML {
     
    public static void main(String[] args) {
    try {
    // new GoHTML().Word2003ToHtml(); //doc
     
    new GoHTML().Word2007ToHtml();
     
     
    } catch (IOException e) {
    // TODO Auto-generated catch block
    e.printStackTrace();
    }
    }
     
    /**
    * 2007版本word转换成html
    * @throws IOException
    */
    @Test
    public void Word2007ToHtml() throws IOException {
    String filepath = "E:/test/";
    String fileName = "demo.docx";
    String htmlName = "123.html";
    final String file = filepath + fileName;
    File f = new File(file);
    if (!f.exists()) {
    System.out.println("Sorry File does not Exists!");
    } else {
    if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {
     
    // 1) 加载word文档生成 XWPFDocument对象
    InputStream in = new FileInputStream(f);
    XWPFDocument document = new XWPFDocument(in);
     
    // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
    File imageFolderFile = new File(filepath);
    XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
    options.setExtractor(new FileImageExtractor(imageFolderFile));
    options.setIgnoreStylesIfUnused(false);
    options.setFragment(true);
     
    // 3) 将 XWPFDocument转换成XHTML
    OutputStream out = new FileOutputStream(new File(filepath + htmlName));
    XHTMLConverter.getInstance().convert(document, out, options);
     
    //也可以使用字符数组流获取解析的内容
    // ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // XHTMLConverter.getInstance().convert(document, baos, options);
    // String content = baos.toString();
    // System.out.println(content);
    // baos.close();
    } else {
    System.out.println("Enter only MS Office 2007+ files");
    }
    }
    }
     
    /**
    * /**
    * 2003版本word转换成html
    * @throws IOException
    * @throws TransformerException
    * @throws ParserConfigurationException
    */
    @Test
    public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException {
    final String imagepath = "F:/test/image/";//解析时候如果doc文件中有图片 图片会保存在此路径
    String filepath = "F:/test/";
    String fileName = "demo.doc";
    String htmlName = "123.html";
    final String file = filepath + fileName;
    InputStream input = new FileInputStream(new File(file));
    HWPFDocument wordDocument = new HWPFDocument(input);
    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
    //设置图片存放的位置
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
    public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
    File imgPath = new File(imagepath);
    if(!imgPath.exists()){//图片目录不存在则创建
    imgPath.mkdirs();
    }
    File file = new File(imagepath + suggestedName);
    try {
    OutputStream os = new FileOutputStream(file);
    os.write(content);
    os.close();
    } catch (FileNotFoundException e) {
    e.printStackTrace();
    } catch (IOException e) {
    e.printStackTrace();
    }
    return imagepath + suggestedName;
    }
    });
     
    //解析word文档
    wordToHtmlConverter.processDocument(wordDocument);
    Document htmlDocument = wordToHtmlConverter.getDocument();
     
    File htmlFile = new File(filepath + htmlName);
    OutputStream outStream = new FileOutputStream(htmlFile);
     
    //也可以使用字符数组流获取解析的内容
    // ByteArrayOutputStream baos = new ByteArrayOutputStream();
    // OutputStream outStream = new BufferedOutputStream(baos);
     
    DOMSource domSource = new DOMSource(htmlDocument);
    StreamResult streamResult = new StreamResult(outStream);
     
    TransformerFactory factory = TransformerFactory.newInstance();
    Transformer serializer = factory.newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");
     
    serializer.transform(domSource, streamResult);
     
    //也可以使用字符数组流获取解析的内容
    // String content = baos.toString();
    // System.out.println(content);
    // baos.close();
    outStream.close();
    }
     
    }
    我只测试了docx ,没有问题
     
     
    但是转pdf出现了一点小麻烦
     
    这个方法是网上很多人都在用的
    <!-- 转pdf -->
    <!-- https://mvnrepository.com/artifact/com.aspose/aspose-words -->
    <dependency>
    <groupId>com.aspose.words</groupId>
    <artifactId>aspose-words-jdk16</artifactId>
    <version>15.8.0</version>
    </dependency>
     
    package b2b.cn.util;
     
    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.InputStream;
     
    import org.junit.Test;
     
    import com.aspose.words.Document;
    import com.aspose.words.License;
    import com.aspose.words.SaveFormat;
    /***
    * 转成pdf工具类
    * 如果注释掉验证 转化成功但是有水印
    * @author Ic055
    *
    */
    public class GoPDF {
     
    public static void main(String[] args) {
    doc2pdf("E:/test/demo.docx");
    }
    public static boolean getLicense() {
    boolean result = false;
    try {
    InputStream is = Test.class.getClassLoader().getResourceAsStream("license.xml"); // license.xml应放在..WebRootWEB-INFclasses路径下
    License aposeLic = new License();
    aposeLic.setLicense(is);
    result = true;
    } catch (Exception e) {
    e.printStackTrace();
    }
    return result;
    }
    public static void doc2pdf(String Address) {
     
    if (!getLicense()) {
    // 验证License 若不验证则转化出的pdf文档会有水印产生 return; }
    return;
     
    }
    try {
    long old = System.currentTimeMillis();
    File file = new File("E:/demo11.pdf"); //新建一个空白pdf文档
    FileOutputStream os = new FileOutputStream(file);
    Document doc = new Document(Address); //Address是将要被转化的word文档
    doc.save(os, SaveFormat.PDF);//全面支持DOC, DOCX, OOXML, RTF HTML, OpenDocument, PDF, EPUB, XPS, SWF 相互转换
    long now = System.currentTimeMillis();
    System.out.println("共耗时:" + ((now - old) / 1000.0) + "秒"); //转化用时
    } catch (Exception e) {
    e.printStackTrace();
    }
     
    }
     
     
     
    }
     
     
     
    license.xml
     
    <License>
    <Data>
    <Products>
    <Product>Aspose.Total for Java</Product>
    <Product>Aspose.Words for Java</Product>
    </Products>
    <EditionType>Enterprise</EditionType>
    <SubscriptionExpiry>20991231</SubscriptionExpiry>
    <LicenseExpiry>20991231</LicenseExpiry>
    <SerialNumber>8bfe198c-7f0c-4ef8-8ff0-acc3237bf0d7</SerialNumber>
    </Data>
    <Signature>sNLLKGMUdF0r8O1kKilWAGdgfs2BvJb/2Xp8p5iuDVfZXmhppo+d0Ran1P9TKdjV4ABwAgKXxJ3jcQTqE/2IRfqwnPf8itN8aFZlV3TJPYeD3yWE7IT55Gz6EijUpC7aKeoohTb4w2fpox58wWoF3SNp6sK6jDfiAUGEHYJ9pjU=</Signature>
    </License>
     
    在网上找到的绝大多数教程中,这个文件应该放在 WebRoot/WEB-INF/classes 目录下
    但是我发现这样一直找不到这个文件,后来偶然看到,对于 maven 项目来说,应该放在 src/main/resources 目录下
     
    还有一个小问题 就是

    <dependency>
    <groupId>com.aspose.words</groupId>
    <artifactId>aspose-words-jdk16</artifactId>
    <version>15.8.0</version>
    </dependency>

    这个可能会报错,所以呢,这个要本地下载jar资源

    aspose-words-15.8.0-jdk16 我这个是在网上找到的资源 然后用eclipse添加到maven仓库就可以用啦

    我放在百度网盘分享给大家

    链接:https://pan.baidu.com/s/1DncAhgqUqfELv193WtTcDQ
    提取码:q41z

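    除了用eclipse添加到maven仓库,我见到还有一种处理方式:把jar放在项目目录下(例如 lib 目录),在依赖中声明 <scope>system</scope>,并用 <systemPath>${basedir}/lib/aspose-words-15.8.0-jdk16.jar</systemPath> 指向它,路径中可以使用下面的内置属性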

    • 内置属性:主要有两个常用内置属性——${basedir}表示项目根目录,即包含pom.xml文件的目录;${version}表示项目版本。
     

     
     
     
     
     
     
    路径
     
     

  • 相关阅读:
    linux 文件系统基本结构
    linux bash命令行基本操作
    U盘安装Centos6.2
    linux安装JDK
    linux重启和关闭系统命令
    eclipse安装反编译工具JadClipse
    Linux系统 Centos6 安装
    Linux 发展史
    计算机硬件
    网络 、osi 七层模型、tcp/ip 五层参考
  • 原文地址:https://www.cnblogs.com/Mr-Y1907/p/11263807.html
Copyright © 2011-2022 走看看