zoukankan      html  css  js  c++  java
  • 用Regex去除HTML标记

    /// <summary>
    /// Removes HTML markup from a string and decodes a small set of character entities,
    /// returning plain text.
    /// </summary>
    /// <remarks>
    /// Processing order:
    /// 1. <c>script</c> elements (including multi-line bodies) and well-formed comments are removed.
    /// 2. Tags are removed.
    /// 3. Every line break that is followed by further whitespace is removed together with that whitespace.
    /// 4. The entities quot, amp, lt, gt, nbsp, iexcl, cent, pound and copy (named or numeric) are
    ///    decoded; any other decimal numeric entity is dropped.
    /// 5. Leftover comment delimiters are cleaned up, then every remaining "&lt;", "&gt;", CR/LF pair
    ///    and literal "&amp;nbsp;" is removed. Angle brackets produced by decoding lt/gt in step 4
    ///    are removed by this step as well.
    /// </remarks>
    /// <param name="strHtml">HTML source text. A null or empty value yields an empty string.</param>
    /// <returns>The input with markup removed.</returns>
    public static string StripHTML(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        const RegexOptions ignoreCase = RegexOptions.IgnoreCase;
        // Singleline lets '.' match '\n', so script bodies and comments that span lines are matched.
        const RegexOptions spanLines = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        string strOutput = strHtml;

        // Script elements go first so that their source code never reaches the output as text.
        strOutput = Regex.Replace(strOutput, @"<script[^>]*?>.*?</script>", "", spanLines);

        // Well-formed comments, including multi-line ones.
        strOutput = Regex.Replace(strOutput, @"<!--.*?-->", "", spanLines);

        // Opening, closing, self-closing and declaration tags. Quoted attribute values may contain '>'.
        // Every character inside the loop can be consumed by exactly one alternative, which avoids the
        // exponential backtracking that nested word-character quantifiers cause on an unclosed '<'.
        strOutput = Regex.Replace(
            strOutput,
            @"<(?:/\s*)?!?\w(?:""[^""]*""|'[^']*'|[^'""<>])*>",
            "",
            ignoreCase);

        // Pattern/replacement pairs, applied in order. Keeping each pair on one row prevents the two
        // halves from drifting apart when the table is edited.
        string[,] rules =
        {
            { @"([\r\n])[\s]+", "" },
            { @"&(quot|#34);", "\"" },
            { @"&(amp|#38);", "&" },
            { @"&(lt|#60);", "<" },
            { @"&(gt|#62);", ">" },
            { @"&(nbsp|#160);", " " },
            { @"&(iexcl|#161);", "\u00A1" },
            { @"&(cent|#162);", "\u00A2" },
            { @"&(pound|#163);", "\u00A3" },
            { @"&(copy|#169);", "\u00A9" },
            // Any numeric entity not decoded above is dropped rather than decoded.
            { @"&#(\d+);", "" },
            // Fallbacks for unbalanced comment delimiters: a stray terminator becomes a line break,
            // and an unterminated opener is removed up to the end of its line.
            { @"-->", "\r\n" },
            { @"<!--.*\n", "" }
        };

        for (int i = 0; i < rules.GetLength(0); i++)
        {
            strOutput = Regex.Replace(strOutput, rules[i, 0], rules[i, 1], ignoreCase);
        }

        // string.Replace returns a new string; the result has to be kept for the clean-up to apply.
        return strOutput
            .Replace("<", "")
            .Replace(">", "")
            .Replace("\r\n", "")
            .Replace("&nbsp;", "");
    }

  • 相关阅读:
    [20190415]10g下那些latch是共享的.txt
    [20190415]11g下那些latch是共享的.txt
    [20190409]pre_page_sga=true与连接缓慢的问题.txt
    [20190402]Library Cache mutex.txt
    scrapy简单使用方法
    PHP多进程系列笔(转)
    RdKafka文档翻译
    python判断字符串中是否包含子字符串
    python 逐行读取txt文件
    redis使用watch完成秒杀抢购功能(转)
  • 原文地址:https://www.cnblogs.com/cuiwenke/p/1688407.html
Copyright © 2011-2022 走看看