zoukankan      html  css  js  c++  java
  • 提取HTML代码中文字的C#函数

    /// <summary>
    /// Matches a whole script element. <c>Singleline</c> lets <c>.</c> cross line breaks, so a
    /// script whose body spans several lines is removed together with its content.
    /// </summary>
    private static readonly Regex StripHtmlScriptRegex = new Regex(
        @"<script[^>]*?>.*?</script>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    /// <summary>
    /// Matches an opening, closing or self-closing tag, including its attributes.
    /// </summary>
    private static readonly Regex StripHtmlTagRegex = new Regex(
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Matches a CR or LF together with the run of whitespace that follows it.
    /// </summary>
    private static readonly Regex StripHtmlLineBreakRegex = new Regex(
        @"([\r\n])[\s]+",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Matches one character entity. Capture groups 1-9 identify the entities that are decoded
    /// (in the same order as <see cref="StripHtmlEntityText"/>); the final alternative matches
    /// any other numeric entity without capturing it.
    /// </summary>
    private static readonly Regex StripHtmlEntityRegex = new Regex(
        @"&(?:(quot|#34)|(amp|#38)|(lt|#60)|(gt|#62)|(nbsp|#160)|(iexcl|#161)|(cent|#162)|(pound|#163)|(copy|#169)|#\d+);",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Replacement text for capture groups 1-9 of <see cref="StripHtmlEntityRegex"/>;
    /// element <c>i</c> belongs to group <c>i + 1</c>.
    /// </summary>
    private static readonly string[] StripHtmlEntityText =
    {
        "\"",     // &quot;  / &#34;
        "&",      // &amp;   / &#38;
        "<",      // &lt;    / &#60;
        ">",      // &gt;    / &#62;
        " ",      // &nbsp;  / &#160;
        "\u00A1", // &iexcl; / &#161;
        "\u00A2", // &cent;  / &#162;
        "\u00A3", // &pound; / &#163;
        "\u00A9"  // &copy;  / &#169;
    };

    /// <summary>
    /// Matches the start of an HTML comment up to and including the end of that line.
    /// </summary>
    private static readonly Regex StripHtmlCommentStartRegex = new Regex(
        @"<!--.*\n",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Returns the text for a single entity matched by <see cref="StripHtmlEntityRegex"/>.
    /// </summary>
    /// <param name="match">A match produced by <see cref="StripHtmlEntityRegex"/>.</param>
    /// <returns>
    /// The decoded character for a known entity, or an empty string for any other numeric
    /// entity (those are removed, not decoded).
    /// </returns>
    private static string DecodeStripHtmlEntity(Match match)
    {
        for (int i = 0; i < StripHtmlEntityText.Length; i++)
        {
            if (match.Groups[i + 1].Success)
            {
                return StripHtmlEntityText[i];
            }
        }

        return string.Empty;
    }

    /// <summary>
    /// Removes HTML markup from a string and decodes a small set of character entities.
    /// </summary>
    /// <remarks>
    /// Steps, in order: script elements are removed with their content; remaining tags are
    /// removed; each line break followed by whitespace is removed together with that
    /// whitespace; entities are decoded in a single pass, so already-escaped text such as
    /// <c>&amp;amp;lt;</c> yields the literal <c>&amp;lt;</c> instead of being decoded twice;
    /// finally every <c>--&gt;</c> becomes a line break and everything from <c>&lt;!--</c> to
    /// the end of that line is removed.
    /// </remarks>
    /// <param name="strHtml">Source text that may contain HTML markup.</param>
    /// <returns>
    /// The text with markup removed, or an empty string when <paramref name="strHtml"/> is
    /// null or empty.
    /// </returns>
    public static string StripHTML(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string text = StripHtmlScriptRegex.Replace(strHtml, string.Empty);
        text = StripHtmlTagRegex.Replace(text, string.Empty);
        text = StripHtmlLineBreakRegex.Replace(text, string.Empty);

        // Entities are decoded only after the tags are gone, so text such as "&lt;b&gt;" ends up
        // as the literal "<b>" rather than being treated as markup.
        text = StripHtmlEntityRegex.Replace(text, new MatchEvaluator(DecodeStripHtmlEntity));

        // Turning "-->" into a line break gives the comment-start pattern an end of line to stop at.
        text = text.Replace("-->", "\r\n");

        // Decoded '<' / '>' characters and the remaining line breaks are deliberately kept in the result.
        return StripHtmlCommentStartRegex.Replace(text, string.Empty);
    }

    参考

    http://www.webjx.com/htmldata/2005-06-08/1118182315.html

  • 相关阅读:
    LeetCode 275. H-Index II
    LeetCode 274. H-Index
    LeetCode Gray Code
    LeetCode 260. Single Number III
    LeetCode Word Pattern
    LeetCode Nim Game
    LeetCode 128. Longest Consecutive Sequence
    LeetCode 208. Implement Trie (Prefix Tree)
    LeetCode 130. Surrounded Regions
    LeetCode 200. Number of Islands
  • 原文地址:https://www.cnblogs.com/blosaa/p/2118052.html
Copyright © 2011-2022 走看看