zoukankan      html  css  js  c++  java
  • 使用POI将doc文件转换为html

    需要的jar包有:poi、poi-scratchpad(HWPF)、jsoup、flying-saucer-pdf(xhtmlrenderer)以及iText(com.lowagie);其中有一些是依赖包,可以使用maven下载。

    doc文件转换为html文件

    package com.gsww.sxzz.controller.service;
    
    import org.apache.poi.hwpf.HWPFDocument;
    import org.apache.poi.hwpf.converter.PicturesManager;
    import org.apache.poi.hwpf.converter.WordToHtmlConverter;
    import org.apache.poi.hwpf.usermodel.Picture;
    import org.apache.poi.hwpf.usermodel.PictureType;
    import org.jsoup.Jsoup; 
    import org.w3c.dom.Document;
    
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.parsers.ParserConfigurationException;
    import javax.xml.transform.OutputKeys;
    import javax.xml.transform.Transformer;
    import javax.xml.transform.TransformerException;
    import javax.xml.transform.TransformerFactory;
    import javax.xml.transform.dom.DOMSource;
    import javax.xml.transform.stream.StreamResult;
    import java.io.*;
    import java.util.List;
    
    /**
     * Created by Carey on 15-2-2.
     */
    public class docTohtml {
    
    
        public static void main(String argv[]) {
            try {
                convert2Html("D:\b.doc","D:\1.html");
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    
        //输出html文件 
        public static void writeFile(String content, String path) {
            FileOutputStream fos = null; 
            BufferedWriter bw = null;
            org.jsoup.nodes.Document doc = Jsoup.parse(content);
            String styleOld=doc.getElementsByTag("style").html();
            //统一字体格式为宋体
            styleOld=styleOld.replaceAll("font-family:.+(?=;\b)", "font-family:SimSun");
            
            doc.getElementsByTag("head").empty();
            doc.getElementsByTag("head").append("<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>");
            doc.getElementsByTag("head").append(" <style type="text/css"></style>");
            doc.getElementsByTag("style").append(styleOld);
            /*正则表达式查询字体内容:font-family:.+(?=;)*/
            System.out.println(content);
            content=doc.html();
            content=content.replace("<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">", "<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></meta>");
            try {
                File file = new File(path);
                fos = new FileOutputStream(file);
                bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));
                bw.write(content);
            } catch (FileNotFoundException fnfe) {
                fnfe.printStackTrace();
            } catch (IOException ioe) {
                ioe.printStackTrace();
            } finally {
                try {
                    if (bw != null)
                        bw.close();
                    if (fos != null)
                        fos.close();
                } catch (IOException ie) {
                }
            }
        }
    
        //word 转 html 
        public static void convert2Html(String fileName, String outPutFile)
                throws TransformerException, IOException,
                ParserConfigurationException {
    
            HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(fileName));//WordToHtmlUtils.loadDoc(new FileInputStream(inputFile));
             //兼容2007 以上版本
    //        XSSFWorkbook  xssfwork=new XSSFWorkbook(new FileInputStream(fileName));
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder()
                            .newDocument());
            wordToHtmlConverter.setPicturesManager( new PicturesManager()
            {
                public String savePicture( byte[] content,
                                           PictureType pictureType, String suggestedName,
                                           float widthInches, float heightInches )
                {
                    return "test/"+suggestedName;
                }
            } );
            wordToHtmlConverter.processDocument(wordDocument);
            
            //save pictures
            List pics=wordDocument.getPicturesTable().getAllPictures();
            if(pics!=null){
                for(int i=0;i<pics.size();i++){
                    Picture pic = (Picture)pics.get(i);
                    System.out.println();
                    try {
                        pic.writeImageContent(new FileOutputStream("D:/test/"
                                + pic.suggestFullFileName()));
                    } catch (FileNotFoundException e) {
                        e.printStackTrace();
                    }
                }
            }
            Document htmlDocument = wordToHtmlConverter.getDocument();
            
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            DOMSource domSource = new DOMSource(htmlDocument);
           
            StreamResult streamResult = new StreamResult(out);
    
    
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer serializer = tf.newTransformer();
         
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
            serializer.transform(domSource, streamResult);
            out.close();
            writeFile(new String(out.toByteArray()), outPutFile);
        }
    }

    遇到的问题:当doc转换为html时,不会将图像的线条转换过来,只有在table表格中才可以转换为span标签。如果要作下划线,可以放一个table的单元格,只设定下边框,这样就可以完美转换为html了。

    将html转换为pdf

    package com.gsww.sxzz.controller.service;
    
    
    import com.lowagie.text.pdf.BaseFont;
    import org.xhtmlrenderer.pdf.ITextFontResolver;
    import org.xhtmlrenderer.pdf.ITextRenderer;
    
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.OutputStream;
    
    /**
     * Created by Carey on 15-2-2.
     */
    public class htmlToPdf {
    
    
        public boolean convertHtmlToPdf(String inputFile, String outputFile)
                 {
    
            
            try {
                    OutputStream     os = new FileOutputStream(outputFile);
                  ITextRenderer renderer = new ITextRenderer();
                    String url = new File(inputFile).toURI().toURL().toString();
                    renderer.setDocument(url);
                    // 解决中文支持问题
                    ITextFontResolver fontResolver = renderer.getFontResolver();
                    /*fontResolver.addFont("C:\Windows\Fonts\simsunb.ttf", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);     
                    *///宋体文件的相对路径
                    fontResolver.addFont("C:\Windows\Fonts\simsun.ttc", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);     
                   
                    renderer.getSharedContext().setBaseURL("file:/D:/");
                    renderer.layout();
                    renderer.createPDF(os);
                    os.flush();
                    os.close();
            } catch (Exception e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
          
            return true;
        }
    
    
         public   static  void  main(String [] args){
             htmlToPdf html2Pdf =new htmlToPdf();
             try {
                 html2Pdf.convertHtmlToPdf("D:\1.html","D:\index.pdf");
             } catch (Exception e) {
                 e.printStackTrace();
             }
         }
    }
  • 相关阅读:
    define vs const vs enum
    解决Ubuntu 14.04 LTS 浏览网页速度慢的问题
    C语言两种产生矩阵的方法
    GTK 添加图标
    Unix Socket 端口 reuse
    Linux GTK Hello,World
    插件使用记录
    原型链和new
    each函数循环数据表示列举,列举循环的时候添加dom的方法
    字体圆润属性的使用-webkit-font-smoothing: antialiased
  • 原文地址:https://www.cnblogs.com/gynbk/p/7230849.html
Copyright © 2011-2022 走看看