zoukankan      html  css  js  c++  java
  • 提取HTML代码中文字的C#函数

    /// <summary>
    /// Removes HTML markup from a string and decodes a small set of common
    /// character entities, leaving only the text content.
    /// </summary>
    /// <param name="strHtml">Source text that may contain HTML markup.</param>
    /// <returns>
    /// The input with script blocks, tags and comment fragments removed, the
    /// handled entities decoded, and any remaining angle brackets, CRLF and LF
    /// characters dropped. Returns an empty string when
    /// <paramref name="strHtml"/> is null or empty.
    /// </returns>
    public static string StripHTML(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        // Each row is { pattern, replacement }. Rows are applied top to bottom and
        // the order matters: markup is removed before entities are decoded, and the
        // named/numeric entities are decoded before the catch-all "&#NNN;" row
        // discards every numeric entity that is still left.
        string[,] rules =
        {
            // Whole script blocks. [\s\S] is used instead of '.' so that a block
            // spanning several lines is removed together with its content.
            { @"<script[^>]*?>[\s\S]*?</script>", "" },
            // Any opening, closing or self-closing tag, including its attributes.
            { @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "" },
            // A line break together with the whitespace that follows it.
            { @"([\r\n])[\s]+", "" },
            { @"&(quot|#34);", "\"" },
            { @"&(amp|#38);", "&" },
            { @"&(lt|#60);", "<" },
            { @"&(gt|#62);", ">" },
            { @"&(nbsp|#160);", " " },
            { @"&(iexcl|#161);", "\u00A1" },
            { @"&(cent|#162);", "\u00A2" },
            { @"&(pound|#163);", "\u00A3" },
            { @"&(copy|#169);", "\u00A9" },
            // Every other numeric entity is dropped.
            { @"&#(\d+);", "" },
            // Comment terminator becomes a line break (removed again below).
            { @"-->", "\r\n" },
            // Comment opener up to the end of its line.
            { @"<!--.*\n", "" }
        };

        string strOutput = strHtml;
        for (int i = 0; i < rules.GetLength(0); i++)
        {
            // The static overload uses the framework's regex cache, so the patterns
            // are not re-parsed on every call.
            strOutput = Regex.Replace(strOutput, rules[i, 0], rules[i, 1], RegexOptions.IgnoreCase);
        }

        // string.Replace returns a new string; the result must be assigned back,
        // otherwise these clean-up steps have no effect.
        strOutput = strOutput.Replace("<", "");
        strOutput = strOutput.Replace(">", "");
        strOutput = strOutput.Replace("\r\n", "");
        strOutput = strOutput.Replace("\n", "");

        return strOutput;
    }

    参考

    http://www.webjx.com/htmldata/2005-06-08/1118182315.html

  • 相关阅读:
    android自动登录
    【199】ArcGIS 添加自定义工具到工具箱
    【198】Synergy
    【197】PowerShell 通过 FTP 下载文件
    【196】Dell 移动工作站系统安装方法
    php如何同时连接多个数据库
    FreeRTOS学习笔记——任务间使用队列同步数据
    牛腩新闻发布系统之发布
    Linux散列表(二)——宏
    Excel导入数据库(三)——SqlBulkCopy
  • 原文地址:https://www.cnblogs.com/blosaa/p/2118052.html
Copyright © 2011-2022 走看看