zoukankan      html  css  js  c++  java
  • 正则表达式,去除所有HTML标签

    // Sample HTML input (table, br, p and img tags); Page_Load overwrites this field with the tag-stripped result.
    string str = "<table><tr><td>sdasasdsdd</td></tr></table><br><p>sds</p><img id='img1' src='http://www.baidu.com/img/baidu_logo.gif' width='100' height='50' alt=''>aaassss<br><img src='http://www.baidu.com/img/baidu_logo.gif' width='100' height='50' alt=''> 说是道 ";
    
        /// <summary>
        /// Removes every HTML tag from the <c>str</c> field except <c>img</c>, <c>br</c>
        /// and opening/closing <c>p</c> tags, and stores the result back into the field.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // Alternative patterns for related clean-up tasks:
            //   @"<[^>]*>"                     - remove every tag
            //   @"<script[^>]*?>.*?</script>"  - remove scripts together with their content
            //   @"<img[^>]*>"                  - remove images only
            //   @"<(?!br).*?>"                 - remove every tag except br
            //   @"<table[^>]*?>.*?</table>"    - remove tables together with their content

            // The \b after the tag name stops the lookahead from also sparing tags that
            // merely start with a kept name (pre, param, ...). [^>]* instead of .*? lets
            // a tag that is wrapped over several lines be removed as well.
            string regexstr = @"<(?!(?:img|br|/?p)\b)[^>]*>";

            str = Regex.Replace(str, regexstr, string.Empty, RegexOptions.IgnoreCase);
        }
    

      

    ASP.NET 去除所有HTML标记 < type="text/javascript">function StorePage(){d=document;t=d.selection?(d.selection.type!='None'?d.selection.createRange().text:''):(d.getSelection?d.getSelection():'');void(keyit=window.open('http://www.365key.com/storeit.aspx?t='+escape(d.title)+'&u='+escape(d.location.href)+'&c='+escape(t),'keyit','scrollbars=no,width=475,height=575,left=75,top=20,status=no,resizable=yes'));keyit.focus();}
    注意:需要先using  System.Text.RegularExpressions;  

    /// <summary>
      /// Strips scripts, comments and tags from an HTML fragment, decodes a small set of
      /// character entities, and returns the remaining text HTML-encoded and trimmed.
      /// </summary>
      /// <param name="Htmlstring">HTML source to clean; <c>null</c> or empty yields an empty string.</param>
      /// <returns>The HTML-encoded plain text with leading and trailing whitespace removed.</returns>
      public static string NoHTML(string Htmlstring)
      {
          if (string.IsNullOrEmpty(Htmlstring))
          {
              return string.Empty;
          }

          // Singleline lets ".*?" cross line breaks, so multi-line script bodies and
          // comments are removed as a whole instead of leaking into the text.
          const RegexOptions BlockOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;
          Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", BlockOptions);
          Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", BlockOptions);

          // Remaining tags, then a line break together with the whitespace that follows it.
          Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
          Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

          // Leftovers of malformed comments: stray terminators and unterminated openers.
          Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
          Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

          // All entities are decoded in ONE pass so that the output of one replacement is
          // never re-read as another entity (e.g. "&amp;lt;" must become "&lt;", not "<").
          Htmlstring = Regex.Replace(
              Htmlstring,
              @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
              new MatchEvaluator(DecodeNoHtmlEntity),
              RegexOptions.IgnoreCase);

          // Any '<' or '>' still present is neutralised by the encoding below.
          // HttpUtility is static, so this also works when HttpContext.Current is null.
          return HttpUtility.HtmlEncode(Htmlstring).Trim();
      }

      // Maps one matched entity (group 1 = name or "#code") to its replacement text.
      private static string DecodeNoHtmlEntity(Match entity)
      {
          switch (entity.Groups[1].Value.ToLowerInvariant())
          {
              case "quot":
              case "#34":
                  return "\"";
              case "amp":
              case "#38":
                  return "&";
              case "lt":
              case "#60":
                  return "<";
              case "gt":
              case "#62":
                  return ">";
              case "nbsp":
              case "#160":
                  return "   "; // three spaces, unchanged from the previous mapping
              case "iexcl":
              case "#161":
                  return "\u00A1";
              case "cent":
              case "#162":
                  return "\u00A2";
              case "pound":
              case "#163":
                  return "\u00A3";
              case "copy":
              case "#169":
                  return "\u00A9";
              default:
                  // Every other numeric entity is dropped, as before.
                  return string.Empty;
          }
      }
    
    
    /**////提取HTML代码中文字的C#函数     
      ///   <summary>   
      ///   去除HTML标记   
      ///   </summary>   
      ///   <param   name="strHtml">包括HTML的源码   </param>   
      ///   <returns>已经去除后的文字</returns>   
      using   System;   
      using   System.Text.RegularExpressions;   
      /// <summary>
      /// Console sample that extracts the plain text from an HTML document.
      /// </summary>
      public class StripHTMLTest
      {
          // Singleline lets ".*?" span line breaks so multi-line scripts/comments go as a whole.
          private static readonly Regex ScriptBlock =
              new Regex(@"<script[^>]*?>.*?</script>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

          private static readonly Regex CommentBlock =
              new Regex(@"<!--.*?-->", RegexOptions.Singleline);

          // Opening/closing/declaration tag: optional "/", optional "!", optional namespace
          // prefix, a name, then attribute text in which quoted values may contain '>'.
          // The alternatives are mutually exclusive on their first character, which keeps
          // matching free of exponential backtracking.
          private static readonly Regex Tag =
              new Regex(@"<(/\s*)?!?(\w+:)?\w+(?:[^>""']|""[^""]*""|'[^']*')*>", RegexOptions.IgnoreCase);

          // A line break together with the whitespace that follows it.
          private static readonly Regex LineBreakRun = new Regex(@"[\r\n]\s+");

          // An unterminated comment opener up to the end of its line.
          private static readonly Regex UnterminatedComment = new Regex(@"<!--.*\n");

          private static readonly Regex Entity =
              new Regex(@"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);", RegexOptions.IgnoreCase);

          /// <summary>
          /// Strips a fixed sample document and writes the resulting text to the console.
          /// </summary>
          public static void Main()
          {
              string s = StripHTML("<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
              Console.WriteLine(s);
          }

          /// <summary>
          /// Removes scripts, comments and tags from HTML and decodes a small set of
          /// character entities.
          /// </summary>
          /// <param name="strHtml">HTML source; <c>null</c> or empty yields an empty string.</param>
          /// <returns>The text that remains after the markup has been removed.</returns>
          public static string StripHTML(string strHtml)
          {
              if (string.IsNullOrEmpty(strHtml))
              {
                  return string.Empty;
              }

              string strOutput = ScriptBlock.Replace(strHtml, string.Empty);
              strOutput = CommentBlock.Replace(strOutput, string.Empty);
              strOutput = Tag.Replace(strOutput, string.Empty);
              strOutput = LineBreakRun.Replace(strOutput, string.Empty);

              // Malformed comment leftovers: a stray terminator becomes a line end so that
              // an unterminated opener on the same line can be cut up to it.
              strOutput = strOutput.Replace("-->", "\r\n");
              strOutput = UnterminatedComment.Replace(strOutput, string.Empty);

              // Entities are decoded last and in a single pass, so decoded text is never
              // re-read as markup or as another entity ("&amp;lt;" becomes "&lt;", not "<").
              strOutput = Entity.Replace(strOutput, new MatchEvaluator(DecodeEntity));

              return strOutput;
          }

          // Maps one matched entity (group 1 = name or "#code") to its replacement text.
          private static string DecodeEntity(Match entity)
          {
              switch (entity.Groups[1].Value.ToLowerInvariant())
              {
                  case "quot":
                  case "#34":
                      return "\"";
                  case "amp":
                  case "#38":
                      return "&";
                  case "lt":
                  case "#60":
                      return "<";
                  case "gt":
                  case "#62":
                      return ">";
                  case "nbsp":
                  case "#160":
                      return "   "; // three spaces, unchanged from the previous mapping
                  case "iexcl":
                  case "#161":
                      return "\u00A1";
                  case "cent":
                  case "#162":
                      return "\u00A2";
                  case "pound":
                  case "#163":
                      return "\u00A3";
                  case "copy":
                  case "#169":
                      return "\u00A9";
                  default:
                      // Every other numeric entity is dropped, as before.
                      return string.Empty;
              }
          }
      }
    
    //写一个静态方法   
      #region   移除HTML标签   
      /// <summary>
      /// Deletes every substring that starts with '&lt;' and runs up to the next '&gt;'.
      /// </summary>
      /// <param name="HTMLStr">Text that may contain HTML tags.</param>
      /// <returns>The input with all such substrings removed.</returns>
      public static string ParseTags(string HTMLStr)
      {
          System.Text.RegularExpressions.Regex anyTag =
              new System.Text.RegularExpressions.Regex("<[^>]*>");
          string withoutTags = anyTag.Replace(HTMLStr, "");
          return withoutTags;
      }
        
      #endregion   
        
                      #region   取出文本中的图片地址   
                      /// <summary>
                      /// Returns the <c>src</c> attribute value of the first <c>img</c> tag in the text.
                      /// </summary>
                      /// <param name="HTMLStr">Text that may contain HTML; may be <c>null</c>.</param>
                      /// <returns>
                      /// The URL exactly as written in the markup (original letter case, surrounding
                      /// quotes removed), or an empty string when the input is <c>null</c>/empty or
                      /// contains no <c>img</c> tag with a <c>src</c> attribute.
                      /// </returns>
                      public static string GetImgUrl(string HTMLStr)
                      {
                          if (string.IsNullOrEmpty(HTMLStr))
                          {
                              return string.Empty;
                          }

                          // IgnoreCase replaces the former ToLower() on the input, which also
                          // lower-cased the returned URL and broke case-sensitive paths.
                          // The value is captured without its quotes: double-quoted,
                          // single-quoted or unquoted. "src" must follow whitespace so that
                          // attributes merely ending in "src" (e.g. data-src) are skipped.
                          Match m = Regex.Match(
                              HTMLStr,
                              @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>""']+))[^>]*>",
                              RegexOptions.IgnoreCase);

                          return m.Success ? m.Groups["url"].Value : string.Empty;
                      }
        
                      #endregion
    

      

  • 相关阅读:
    嵌入式成长轨迹36 【Zigbee项目】【单片机基础】【单片机SD卡】
    嵌入式成长轨迹31 【嵌入式学习阶段】【ARM环境调试】【UbuntuWin7 NAT联网】
    一个jQuery弹出层(tipsWindown)
    sql的left join 命令详解
    input javascript 之 onclick 大全
    php中调用用户自定义函数的方
    asp 正则表达式使用方法
    conn.execute的用法
    vbscript中的True和False
    JavaScript Cookie 的正确用法
  • 原文地址:https://www.cnblogs.com/xchit/p/1848661.html
Copyright © 2011-2022 走看看