zoukankan      html  css  js  c++  java
  • Java 正则表达式去除 HTML 中所有的标签和特殊 HTML 字符(以 & 开头的)

    来源于:https://www.androiddev.net/java%E6%AD%A3%E5%88%99%E8%A1%A8%E8%BE%BE%E5%BC%8F%E5%8E%BB%E9%99%A4html%E4%B8%AD%E6%89%80%E6%9C%89%E7%9A%84%E6%A0%87%E7%AD%BE%E5%92%8C%E7%89%B9%E6%AE%8Ahtml%E5%AD%97%E7%AC%A6%EF%BC%88%E4%BB%A5/

    package com.comcons.utils;
    
    import java.io.BufferedReader;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.IOException;
    import java.util.regex.Pattern;
    
    /**
     * Reduces an HTML string to plain text by stripping markup with regular expressions.
     *
     * <p>The filtering is purely regex-based (no HTML parsing). It removes, in order:
     * {@code <script>} elements including their content, {@code <style>} elements including
     * their content, every remaining {@code <...>} tag, and alphabetic character entities such
     * as {@code &nbsp;}.
     */
    public class ReduceHtml2Text {

        /**
         * Matches a complete script element, body included. Whitespace is tolerated after
         * {@code <}, around the closing {@code /} and before the final {@code >}.
         * {@code [\s\S]} is used so the body may span several lines.
         */
        private static final Pattern SCRIPT_PATTERN = Pattern.compile(
                "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?/[\\s]*?script[\\s]*?>",
                Pattern.CASE_INSENSITIVE);

        /** Matches a complete style element, body included; same shape as the script pattern. */
        private static final Pattern STYLE_PATTERN = Pattern.compile(
                "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?/[\\s]*?style[\\s]*?>",
                Pattern.CASE_INSENSITIVE);

        /** Matches any single tag: {@code <}, one or more non-{@code >} characters, {@code >}. */
        private static final Pattern TAG_PATTERN = Pattern.compile(
                "<[^>]+>",
                Pattern.CASE_INSENSITIVE);

        /**
         * Matches character entities made of 1 to 10 ASCII letters, e.g. {@code &nbsp;}.
         * Numeric references such as {@code &#160;} are not matched by this pattern.
         */
        private static final Pattern ENTITY_PATTERN = Pattern.compile(
                "&[a-zA-Z]{1,10};",
                Pattern.CASE_INSENSITIVE);

        /** File read by {@link #main(String[])} when no path is given on the command line. */
        private static final String DEFAULT_INPUT_FILE = "D:/test.html";

        /**
         * Removes HTML tags and alphabetic character entities from a string.
         *
         * <p>Script and style elements are removed first, together with their content, so that
         * their bodies do not survive as text once the generic tag filter has run.
         *
         * @param inputString the string containing HTML markup; may be {@code null}
         * @return the input with all matches removed, or {@code null} if the input was
         *         {@code null}
         */
        public static String removeHtmlTag(String inputString) {
            if (inputString == null) {
                return null;
            }
            String text = inputString;
            text = SCRIPT_PATTERN.matcher(text).replaceAll(""); // drop script elements
            text = STYLE_PATTERN.matcher(text).replaceAll("");  // drop style elements
            text = TAG_PATTERN.matcher(text).replaceAll("");    // drop remaining tags
            text = ENTITY_PATTERN.matcher(text).replaceAll(""); // drop entities like &nbsp;
            return text;
        }

        /**
         * Manual test entry point: reads an HTML file, strips the markup and prints the result
         * to standard output.
         *
         * @param args optional; {@code args[0]} is the path of the file to read. When absent,
         *             {@code D:/test.html} is used.
         */
        public static void main(String[] args) {
            String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_FILE;
            StringBuilder html = new StringBuilder();
            // try-with-resources closes the reader (and the underlying file) on every path.
            try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the line terminator; put one back so that text on
                    // adjacent lines is not glued together in the output.
                    html.append(line).append('\n');
                }
            } catch (FileNotFoundException e) {
                System.err.println("Input file not found: " + path);
            } catch (IOException e) {
                System.err.println("Failed to read " + path + ": " + e.getMessage());
            }
            // As before, whatever was read (possibly nothing) is still processed and printed.
            System.out.println(ReduceHtml2Text.removeHtmlTag(html.toString()));
        }
    }
  • 相关阅读:
    二维数组的查找问题
    将字符串编码成数值,求数值最大和问题(今日头条笔试题)
    链表的倒序打印
    求方程的近似解
    多边形构成问题(今日头条笔试题)
    各种语言数据类型大小
    luoguP1551 亲戚
    Codeforces 764 A-B
    Mixing Chemicals
    Day 8 of JZJX
  • 原文地址:https://www.cnblogs.com/ys-wuhan/p/6604861.html
Copyright © 2011-2022 走看看