zoukankan      html  css  js  c++  java
  • html转换text-分段落,实现富文本导入word的格式转换,标签过滤

    html转换text-分段落,实现富文本导入word的格式转换,标签过滤

    一、工具类 Html2Text

    import javax.swing.text.html.HTMLEditorKit;
    import javax.swing.text.html.parser.ParserDelegator;
    import java.io.*;
    
    /**
     * Extracts the plain text of an HTML string with the Swing HTML parser.
     *
     * <p>The class is a {@link HTMLEditorKit.ParserCallback}: during {@link #parse(String)}
     * the parser reports every text run to {@link #handleText(char[], int)}, and the runs
     * are appended to one buffer in the order received. Tags are not reported as text and
     * no separator is inserted between runs.
     *
     * <p>An instance keeps the text of its latest parse in a mutable field, so a single
     * instance must not be shared by concurrent parses. {@link #getContent(String)} uses
     * a new instance for every call and therefore shares no buffer between calls.
     */
    public class Html2Text extends HTMLEditorKit.ParserCallback {

        /** Text collected by the latest {@link #parse(String)}; empty until a parse has run. */
        StringBuffer s = new StringBuffer();

        public Html2Text() {
        }

        /**
         * Parses {@code str} as HTML and stores its text so that {@link #getText()} returns it.
         * Text collected by an earlier parse on this instance is discarded.
         *
         * @param str the HTML to parse; must not be {@code null}
         * @throws IOException if the parser fails while reading the input
         */
        public void parse(String str) throws IOException {
            s = new StringBuffer();
            // Feed the characters to the parser directly. Encoding the string to bytes and
            // decoding it again with the platform default charset can corrupt characters
            // that this charset cannot represent.
            try (Reader in = new StringReader(str)) {
                // the third parameter is TRUE to ignore charset directive
                new ParserDelegator().parse(in, this, true);
            }
        }

        /** Ignores the end-of-line string reported by the parser; it is not part of the text. */
        @Override
        public void handleEndOfLineString(String eol) {
        }

        /**
         * Parser callback invoked for each run of text found between tags; appends the run
         * to the collected text.
         *
         * @param text the characters of the text run
         * @param pos  position reported by the parser (unused)
         */
        @Override
        public void handleText(char[] text, int pos) {
            s.append(text);
        }

        /**
         * Returns the text collected by the latest {@link #parse(String)}.
         *
         * @return the collected text, or an empty string if nothing has been parsed yet
         */
        public String getText() {
            return s.toString();
        }

        /**
         * Convenience entry point: returns the plain text contained in an HTML string.
         *
         * @param str the HTML to convert; {@code null} is treated as empty input
         * @return the text of {@code str} with the markup removed, never {@code null}
         * @throws UncheckedIOException if the parser reports an I/O error
         */
        public static String getContent(String str) {
            if (str == null) {
                return "";
            }
            // A fresh parser per call: a shared instance would mix up the text of
            // overlapping calls because the buffer is instance state.
            Html2Text parser = new Html2Text();
            try {
                parser.parse(str);
            } catch (IOException e) {
                throw new UncheckedIOException("Failed to parse HTML content", e);
            }
            return parser.getText();
        }
    }
    

    二、分段实现

    /**
     * Converts rich-text HTML into a list of plain-text paragraphs.
     *
     * <p>The input is cut in front of every opening {@code <p>} or {@code <div>} tag
     * (matched case-insensitively). Each fragment is reduced to plain text with
     * {@code Html2Text.getContent}, and fragments for which
     * {@code StringUtil.isNotEmpty} is false are dropped. Content that precedes the
     * first paragraph tag is kept as a fragment of its own.
     *
     * <p>Originally written by gaogushenling (2021/10/23, version 1.0).
     *
     * @param xmlStr the rich-text HTML; {@code null} is treated as empty input
     * @return the paragraph texts in document order; when no paragraph has text, a list
     *         holding the single placeholder message "富文本文件是空的"
     *         ("the rich-text file is empty")
     */
    private List<String> getTextContentP(String xmlStr) {
        List<String> textList = new ArrayList<>();
        if (xmlStr != null) {
            // Zero-width lookahead: split before "<p"/"<div" only when the tag name ends
            // there (\b), so tags such as <pre> are not split points and the paragraph
            // tag stays attached to its fragment. Nothing in the text itself is rewritten.
            for (String fragment : xmlStr.split("(?i)(?=<(?:p|div)\\b)")) {
                String text = Html2Text.getContent(fragment);
                if (StringUtil.isNotEmpty(text)) {
                    textList.add(text);
                }
            }
        }
        if (textList.isEmpty()) {
            textList.add("富文本文件是空的");
        }
        return textList;
    }
    

    调用

        // Example call: the string literal is a placeholder; pass the actual rich-text HTML instead.
        List<String> textList = getTextContentP("富文本(html格式)");
    
  • 相关阅读:
    AMD and CMD are dead之KMD规范
    AMD and CMD are dead之js模块化黑魔法
    码农干货系列【20】--add gtTime to Promise.js
    ReactNative: 使用选择器组件PickerIOS组件
    ReactNative: 使用选择器组件Picker组件
    ReactNative: 使用模态组件Modal组件
    ReactNative: 使用列表组件ListView组件
    ReactNative: 使用键盘避免视图KeyboardAvoidingView组件实现键盘自适应
    ReactNative: 使用图片存储组件ImageStore组件
    ReactNative: 使用图片裁剪组件ImageEditor组件
  • 原文地址:https://www.cnblogs.com/gaogushenling/p/15443144.html
Copyright © 2011-2022 走看看