zoukankan      html  css  js  c++  java
  • Java实现Word/Pdf/TXT转html

    引言:

        最近公司在做一个教育培训学习及在线考试的项目,本人主要负责网络课程模块,包括课程分类、课程、课件的创建,以及在线学习和统计功能。课件涉及多种类型,如视频、音频、图文、外部链接及文档类型。其中就涉及到一个问题:文档型课件如何在网页上展示和学习。因为要在线统计学习的课程、学习的人员和学习的时长,所以不能像传统做法那样将文档下载到本地学习,那样就不受系统控制了。最终的方案是:在上传文档型课件的时候,将文件转换成对应的HTML文件,以便在网页上浏览学习。

     下边主要针对word,pdf和txt文本文件进行转换

    一:Java实现将word转换为html

       1:引入依赖

     <dependency>
       <groupId>fr.opensagres.xdocreport</groupId>
       <artifactId>fr.opensagres.xdocreport.document</artifactId>
       <version>1.0.5</version>
     </dependency>
     <dependency>
       <groupId>fr.opensagres.xdocreport</groupId>
       <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
       <version>1.0.5</version>
     </dependency>
     <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi</artifactId>
       <version>3.12</version>
     </dependency>
     <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi-scratchpad</artifactId>
       <version>3.12</version>
     </dependency>

      2:代码demo

      1 package com.svse.controller;
      2 
      import java.io.File;
      import java.io.FileInputStream;
      import java.io.FileNotFoundException;
      import java.io.FileOutputStream;
      import java.io.IOException;
      import java.io.InputStream;
      import java.io.OutputStream;
      import java.text.SimpleDateFormat;
      import java.util.Date;

      import javax.xml.parsers.DocumentBuilderFactory;
      import javax.xml.parsers.ParserConfigurationException;
      import javax.xml.transform.OutputKeys;
      import javax.xml.transform.Transformer;
      import javax.xml.transform.TransformerException;
      import javax.xml.transform.TransformerFactory;
      import javax.xml.transform.dom.DOMSource;
      import javax.xml.transform.stream.StreamResult;

      import org.apache.poi.hwpf.HWPFDocument;
      import org.apache.poi.hwpf.converter.PicturesManager;
      import org.apache.poi.hwpf.converter.WordToHtmlConverter;
      import org.apache.poi.hwpf.usermodel.PictureType;
      import org.apache.poi.xwpf.converter.core.BasicURIResolver;
      import org.apache.poi.xwpf.converter.core.FileImageExtractor;
      import org.apache.poi.xwpf.converter.core.FileURIResolver;
      import org.apache.poi.xwpf.converter.core.IURIResolver;
      import org.apache.poi.xwpf.converter.core.IXWPFConverter;
      import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
      import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
      import org.apache.poi.xwpf.usermodel.XWPFDocument;
      import org.w3c.dom.Document;
     24 /**
     25  * word 转换成html
     26  */
     27 public class TestWordToHtml {
     28 
     29     public static  final String STORAGEPATH="C://works//files//";
     30     public static  final String IP="192.168.30.222";
     31     public static  final String PORT="8010";
     32     public static void main(String[] args) throws IOException, TransformerException, ParserConfigurationException {
     33         TestWordToHtml wt=new TestWordToHtml();
     34         //wt.Word2003ToHtml("甲骨文考证.doc");
     35         wt.Word2007ToHtml("甲骨文考证.docx");
     36 
     37     }
     38       
     39      /**
     40      * 2003版本word转换成html
     41      * @throws IOException
     42      * @throws TransformerException
     43      * @throws ParserConfigurationException
     44      */
     45     public void Word2003ToHtml(String fileName) throws IOException, TransformerException, ParserConfigurationException {
     46        
     47         final String imagepath = STORAGEPATH+"fileImage/";//解析时候如果doc文件中有图片  图片会保存在此路径
     48         final String strRanString=getRandomNum();
     49         String filepath =STORAGEPATH;
     50         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2003.html";
     51         final String file = filepath + fileName;
     52         InputStream input = new FileInputStream(new File(file));
     53         HWPFDocument wordDocument = new HWPFDocument(input);
     54         WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
     55         //设置图片存放的位置
     56         wordToHtmlConverter.setPicturesManager(new PicturesManager() {
     57             public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
     58                 File imgPath = new File(imagepath);
     59                 if(!imgPath.exists()){//图片目录不存在则创建
     60                     imgPath.mkdirs();
     61                 }
     62                 
     63                 File file = new File(imagepath +strRanString+suggestedName);
     64                 try {
     65                     OutputStream os = new FileOutputStream(file);
     66                     os.write(content);
     67                     os.close();
     68                 } catch (FileNotFoundException e) {
     69                     e.printStackTrace();
     70                 } catch (IOException e) {
     71                     e.printStackTrace();
     72                 }
     73                 
     74                 return  "http://"+IP+":"+PORT+"//uploadFile/fileImage/"+strRanString+suggestedName;
     75                // return imagepath +strRanString+suggestedName;
     76             }
     77         });
     78         
     79         //解析word文档
     80         wordToHtmlConverter.processDocument(wordDocument);
     81         Document htmlDocument = wordToHtmlConverter.getDocument();
     82         
     83         File htmlFile = new File(filepath +strRanString+htmlName);
     84         OutputStream outStream = new FileOutputStream(htmlFile);
     85         
     86 
     87         DOMSource domSource = new DOMSource(htmlDocument);
     88         StreamResult streamResult = new StreamResult(outStream);
     89 
     90         TransformerFactory factory = TransformerFactory.newInstance();
     91         Transformer serializer = factory.newTransformer();
     92         serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
     93         serializer.setOutputProperty(OutputKeys.INDENT, "yes");
     94         serializer.setOutputProperty(OutputKeys.METHOD, "html");
     95         
     96         serializer.transform(domSource, streamResult);
     97         outStream.close();
     98         
     99         System.out.println("生成html文件路径:"+ "http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
    100     }
    101 
    102     /**
    103      * 2007版本word转换成html
    104      * @throws IOException
    105      */
    106     public void Word2007ToHtml(String fileName) throws IOException {
    107         
    108        final String strRanString=getRandomNum();
    109         
    110         String filepath = STORAGEPATH+strRanString;
    111         String htmlName =fileName.substring(0, fileName.indexOf("."))+ "2007.html";
    112         File f = new File(STORAGEPATH+fileName);  
    113         if (!f.exists()) {  
    114             System.out.println("Sorry File does not Exists!");  
    115         } else {  
    116             if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {  
    117                 try {
    118                     // 1) 加载word文档生成 XWPFDocument对象  
    119                     InputStream in = new FileInputStream(f);  
    120                     XWPFDocument document = new XWPFDocument(in);  
    121       
    122                     // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)  
    123                     File imageFolderFile = new File(filepath);  
    124                     XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));  
    125                     options.setExtractor(new FileImageExtractor(imageFolderFile));  
    126                     options.URIResolver(new IURIResolver() {
    127                         public String resolve(String uri) {
    128                             //http://192.168.30.222:8010//uploadFile/....
    129                             return "http://"+IP+":"+PORT+"//uploadFile/"+strRanString +"/"+ uri;
    130                         }
    131                     });
    132                     
    133                     options.setIgnoreStylesIfUnused(false);  
    134                     options.setFragment(true);  
    135                       
    136                     // 3) 将 XWPFDocument转换成XHTML  
    137                     OutputStream out = new FileOutputStream(new File(filepath + htmlName));  
    138                     IXWPFConverter<XHTMLOptions> converter = XHTMLConverter.getInstance();
    139                     converter.convert(document,out, options);
    140                     //XHTMLConverter.getInstance().convert(document, out, options);  
    141                     System.out.println("html路径:"+"http://"+IP+":"+PORT+"//uploadFile/"+strRanString+htmlName);
    142                 } catch (Exception e) {
    143                     e.printStackTrace();
    144                 }
    145             
    146             } else {  
    147                 System.out.println("Enter only MS Office 2007+ files");  
    148             }  
    149         }  
    150     }  
    151 
    152      /**
    153      *功能说明:生成时间戳
    154      *创建人:zsq
    155      *创建时间:2019年12月7日 下午2:37:09
    156      *
    157      */
    158      public static String getRandomNum(){
    159          Date dt = new Date();
    160          SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");  
    161          String str=sdf.format(dt);
    162          return str;
    163      }
    164      
    165    } 

    二:Java实现将Pdf转换为html

      1: 引入依赖

     <dependency>
       <groupId>net.sf.cssbox</groupId>
       <artifactId>pdf2dom</artifactId>
       <version>1.7</version>
     </dependency>
     <dependency>
       <groupId>org.apache.pdfbox</groupId>
       <artifactId>pdfbox</artifactId>
       <version>2.0.12</version>
     </dependency>
     <dependency>
       <groupId>org.apache.pdfbox</groupId>
       <artifactId>pdfbox-tools</artifactId>
       <version>2.0.12</version>
     </dependency>

    2:代码Demo

     1 public class PdfToHtml {
     2 
     3   /*
     4     pdf转换html
     5      */
     6     public void pdfToHtmlTest(String inPdfPath,String outputHtmlPath)  {
     7        // String outputPath = "C:\works\files\ZSQ保密知识测试题库.html";
     8     9        //try() 写在()里面会自动关闭流
    10         try{
    11             BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(outputHtmlPath)),"utf-8"));
    12             //加载PDF文档
    13             //PDDocument document = PDDocument.load(bytes);
    14             PDDocument document = PDDocument.load(new File(inPdfPath));
    15             PDFDomTree pdfDomTree = new PDFDomTree();
    16             pdfDomTree.writeText(document,out);
    17         } catch (Exception e) {
    18             e.printStackTrace();
    19         }
    20     }
    21 
    22     public static void main(String[] args) throws IOException {
    23         PdfToHtml ph=new PdfToHtml();
    24         String pdfPath="C:\works\files\武研中心行政考勤制度.pdf";
    25         String outputPath="C:\works\files\武研中心行政考勤制度.html";
    26         ph.pdfToHtmlTest(pdfPath,outputPath);
    27   }
    28 
    29 }

    三:Java实现将TXT转换为html

     1  /*
     2      * txt文档转html
     3        filePath:txt原文件路径
     4        htmlPosition:转化后生成的html路径
     5     */
     6     public static void txtToHtml(String filePath, String htmlPosition) {
     7         try {
     8             //String encoding = "GBK";
     9             File file = new File(filePath);
    10             if (file.isFile() && file.exists()) { // 判断文件是否存在
    11                 InputStreamReader read = new InputStreamReader(new FileInputStream(file), "GBK");
    12                 // 考虑到编码格式
    13                 BufferedReader bufferedReader = new BufferedReader(read);
    14                 // 写文件
    15                 FileOutputStream fos = new FileOutputStream(new File(htmlPosition));
    16                 OutputStreamWriter osw = new OutputStreamWriter(fos, "GBK");
    17                 BufferedWriter bw = new BufferedWriter(osw);
    18                 String lineTxt = null;
    19                 while ((lineTxt = bufferedReader.readLine()) != null) {
    20                     bw.write("&nbsp&nbsp&nbsp"+lineTxt + "</br>");
    21                 }
    22                 bw.close();
    23                 osw.close();
    24                 fos.close();
    25                 read.close();
    26             } else {
    27                 System.out.println("找不到指定的文件");
    28             }
    29         } catch (Exception e) {
    30             System.out.println("读取文件内容出错");
    31             e.printStackTrace();
    32         }
    33     }
  • 相关阅读:
    with一个对象,自动触发__enter__方法
    SQLAlchemy-Utils
    SQLAlchemy
    wtforms
    Python数据库连接池DBUtils(基于pymysql模块连接数据库)
    VMWare安装linux centos,安装中文输入法
    HttpServletRequest和ServletRequest的区别
    Java序列化
    mybatis循环取序列,值相同问题处理
    利用jdk将wsdl生成java代码
  • 原文地址:https://www.cnblogs.com/zhaosq/p/12069087.html
Copyright © 2011-2022 走看看