zoukankan      html  css  js  c++  java
  • ASP.NET 如何去掉 HTML 标记

    /// <summary>
    /// Strips HTML markup from a string and returns the remaining text, HTML-encoded.
    /// </summary>
    /// <remarks>
    /// Steps, in order: remove script elements, remove comments, remove tags, remove
    /// line breaks that are followed by whitespace, remove stray comment markers, decode
    /// a small set of entities in a single pass, drop any remaining angle brackets and
    /// CRLF pairs, then HTML-encode and trim the result.
    /// </remarks>
    /// <param name="Htmlstring">Text that may contain HTML markup. Null is treated as empty.</param>
    /// <returns>The text with markup removed, HTML-encoded and trimmed; empty for null or empty input.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Singleline lets '.' cross line breaks, so multi-line scripts and comments
        // are removed as a whole instead of leaking their content into the text.
        const RegexOptions AcrossLines = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        // Remove script elements together with their content.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", AcrossLines);

        // Remove complete comments before tags; otherwise a '>' inside a comment
        // would end the tag match early and expose the rest of the comment.
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", AcrossLines);

        // Remove tags.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

        // Remove a line break together with the whitespace that follows it.
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

        // Remove stray comment terminators and unterminated comment openers (to end of line).
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode entities in ONE pass so that the output of one replacement is never
        // re-read as another entity (e.g. "&amp;lt;" must become "&lt;", not "<").
        Htmlstring = Regex.Replace(
            Htmlstring,
            @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
            new MatchEvaluator(DecodeNoHtmlEntity),
            RegexOptions.IgnoreCase);

        // string.Replace returns a new string; the result has to be assigned back.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // HttpUtility.HtmlEncode is the encoder behind Server.HtmlEncode, but it does not
        // need a current HttpContext, so this also works outside of a web request.
        return System.Web.HttpUtility.HtmlEncode(Htmlstring).Trim();
    }

    // Returns the replacement text for one entity matched by NoHTML.
    // Numeric references that are not listed here are removed.
    private static string DecodeNoHtmlEntity(Match entity)
    {
        switch (entity.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                // One non-breaking space becomes one ordinary space.
                return " ";
            case "iexcl":
            case "#161":
                return "\u00A1";
            case "cent":
            case "#162":
                return "\u00A2";
            case "pound":
            case "#163":
                return "\u00A3";
            case "copy":
            case "#169":
                return "\u00A9";
            default:
                return "";
        }
    }



    ///提取HTML代码中文字的C#函数     
      
    ///   <summary>   
      
    ///   去除HTML标记   
      
    ///   </summary>   
      
    ///   <param   name="strHtml">包括HTML的源码   </param>   
      
    ///   <returns>已经去除后的文字</returns>   

      using   System;   
      
    using   System.Text.RegularExpressions;   
      
    /// <summary>
    /// Small console demo around <see cref="StripHTML"/>, which reduces an HTML
    /// fragment to plain text.
    /// </summary>
    public class StripHTMLTest
    {
        // Script elements including their content; Singleline lets '.' span line breaks.
        private static readonly Regex ScriptBlock =
            new Regex(@"<script[^>]*?>.*?</script>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Complete comments, possibly spanning several lines.
        private static readonly Regex CommentBlock =
            new Regex(@"<!--.*?-->", RegexOptions.Singleline);

        // An opening, closing or declaration tag: '<', optional '/', optional '!', a name
        // character, then quoted strings or any other characters up to the closing '>'.
        // The three alternatives start with different characters, so a tag that is never
        // closed fails quickly instead of backtracking exponentially.
        private static readonly Regex Tag =
            new Regex(@"<(?:\/\s*)?!?\w(?:""[^""]*""|'[^']*'|[^'"">])*>");

        // A line break together with the whitespace that follows it.
        private static readonly Regex LineBreakRun = new Regex(@"([\r\n])[\s]+");

        // The entities this class understands, plus any other numeric reference.
        private static readonly Regex Entity =
            new Regex(@"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);", RegexOptions.IgnoreCase);

        // A comment opener and the rest of its line (the line feed is optional so that
        // an opener on the last line is handled as well).
        private static readonly Regex CommentToEndOfLine = new Regex(@"<!--.*\n?");

        /// <summary>
        /// Prints the text extracted from a sample HTML document.
        /// </summary>
        public static void Main()
        {
            string s = StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
            Console.WriteLine(s);
        }

        /// <summary>
        /// Removes HTML markup from a string.
        /// </summary>
        /// <remarks>
        /// Steps, in order: remove script elements, comments and tags; remove line breaks
        /// that are followed by whitespace; decode a small set of entities in a single
        /// pass; remove comment markers that appear after decoding; finally drop every
        /// remaining angle bracket and CRLF pair. The result is NOT HTML-encoded.
        /// </remarks>
        /// <param name="strHtml">Text that may contain HTML markup. Null is treated as empty.</param>
        /// <returns>The text with markup removed; empty for null or empty input.</returns>
        public static string StripHTML(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            string strOutput = ScriptBlock.Replace(strHtml, "");
            strOutput = CommentBlock.Replace(strOutput, "");
            strOutput = Tag.Replace(strOutput, "");
            strOutput = LineBreakRun.Replace(strOutput, "");

            // One pass only, so the output of one replacement is never re-read as
            // another entity (e.g. "&amp;lt;" must become "&lt;", not "<").
            strOutput = Entity.Replace(strOutput, new MatchEvaluator(DecodeEntity));

            // Turn each remaining comment terminator into a line break so that the
            // next rule can remove the comment from its opener up to that break.
            strOutput = strOutput.Replace("-->", "\r\n");
            strOutput = CommentToEndOfLine.Replace(strOutput, "");

            // string.Replace returns a new string; the result has to be assigned back.
            // Without this, decoded "&lt;script&gt;" would be returned as a live tag.
            strOutput = strOutput.Replace("<", "");
            strOutput = strOutput.Replace(">", "");
            strOutput = strOutput.Replace("\r\n", "");

            return strOutput;
        }

        // Returns the replacement text for one entity matched by the Entity pattern.
        // Numeric references that are not listed here are removed.
        private static string DecodeEntity(Match entity)
        {
            switch (entity.Groups[1].Value.ToLowerInvariant())
            {
                case "quot":
                case "#34":
                    return "\"";
                case "amp":
                case "#38":
                    return "&";
                case "lt":
                case "#60":
                    return "<";
                case "gt":
                case "#62":
                    return ">";
                case "nbsp":
                case "#160":
                    // One non-breaking space becomes one ordinary space.
                    return " ";
                case "iexcl":
                case "#161":
                    return "\u00A1";
                case "cent":
                case "#162":
                    return "\u00A2";
                case "pound":
                case "#163":
                    return "\u00A3";
                case "copy":
                case "#169":
                    return "\u00A9";
                default:
                    return "";
            }
        }
    }


    写一个静态方法   
      
    移除HTML标签   
        
                      
    取出文本中的图片地址
  • 相关阅读:
    Linux磁盘分区实例演示
    浅谈Linux下的rpm
    You have 1 unapplied migration(s). Your project may not work properly until you apply the migrations for app(s): shopadmin. Run 'python manage.py migrate' to apply them.
    Xshell Linux常用命令
    OSError: mysql_config not found
    AttributeError: module 'datetime' has no attribute 'now'
    CentOS查看进程端口号以及kill操作
    nginx报错 nginx: [alert] kill(25903, 1) failed (3: No such process)
    3D 散点图的绘制
    关系数据库和非关系型数据
  • 原文地址:https://www.cnblogs.com/goody9807/p/961195.html
Copyright © 2011-2022 走看看