zoukankan      html  css  js  c++  java
  • C#正则提取html图片等

    去除html标记,比较实用,分享给大家。

       ///   <summary>
       ///   去除HTML标记
       ///   </summary>
       ///   <param   name="Htmlstring">包括HTML的源码   </param>
       ///   <returns>已经去除后的文字</returns> 
       public   static   string   NoHTML(string   Htmlstring)
       {
        //删除脚本
       Htmlstring = Htmlstring.Replace(" ","");
       Htmlstring = Regex.Replace(Htmlstring,@"<script.*?</script>","",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"<style.*?</style>","",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"<.*?>","",RegexOptions.IgnoreCase);
       //删除HTML
       Htmlstring = Regex.Replace(Htmlstring,@"<(.[^>]*)>","",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"([ ])[s]+","",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"-->","",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"<!--.*","",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&(quot|#34);",""",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&(amp|#38);","&",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&(lt|#60);","<",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&(gt|#62);",">",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&(nbsp|#160);","",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&(iexcl|#161);","xa1",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&(cent|#162);","xa2",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&(pound|#163);","xa3",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&(copy|#169);","xa9",RegexOptions.IgnoreCase);
       Htmlstring = Regex.Replace(Htmlstring,@"&#(d+);","",RegexOptions.IgnoreCase); 
       Htmlstring = Htmlstring.Replace("<","");
       Htmlstring = Htmlstring.Replace(">","");
       Htmlstring = Htmlstring.Replace(" ","");   
       Htmlstring=HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();
       return Htmlstring;
       }

       /// <summary>
       /// 提取HTML代码中文字的C#函数
       /// </summary>
       public   static   string   StripHTML(string   strHtml)
       {
        string   []   aryReg   ={
               @"<script[^>]*?>.*?</script>",
               @"<(/s*)?!?((w+:)?w+)(w+(s*=?s*(([""'])(\[""'tbnr]|[^7])*?7|w+)|.{0})|s)*?(/s*)?>",
               @"([ ])[s]+",
               @"&(quot|#34);",
               @"&(amp|#38);",
               @"&(lt|#60);",
               @"&(gt|#62);",
               @"&(nbsp|#160);",
               @"&(iexcl|#161);",
               @"&(cent|#162);",
               @"&(pound|#163);",
               @"&(copy|#169);",
               @"&#(d+);",
               @"-->",
               @"<!--.* "
              };
        string   []   aryRep   =   {
                 "",
                 "",
                 "",
                 """,
                 "&",
                 "<",
                 ">",
                 "   ",
                 "xa1",//chr(161), 
                 "xa2",//chr(162), 
                 "xa3",//chr(163), 
                 "xa9",//chr(169), 
                 "",
                 " ",
                 ""
                };
        string   newReg   =aryReg[0];
        string   strOutput=strHtml;
        for(int   i   =   0;i<aryReg.Length;i++)
        {
         Regex   regex   =   new   Regex(aryReg[i],RegexOptions.IgnoreCase);
         strOutput   =   regex.Replace(strOutput,aryRep[i]);
        }
        strOutput.Replace("<","");
        strOutput.Replace(">","");
        strOutput.Replace(" ","");
        return   strOutput;
       }
      
      
        #region   取出文本中的图片地址 
        /**////   <summary>
        ///   取出文本中的图片地址
        ///   </summary>
        ///   <param   name="HTMLStr">HTMLStr</param> 
        public   static   string   GetImgUrl(string   HTMLStr)
        {
         string   str   =   string.Empty;
         string   sPattern   =   @"^<imgs+[^>]*>";
         Regex   r   =   new   Regex(@"<imgs+[^>]*s*srcs*=s*([']?)(?<url>S+)'?[^>]*>",
          RegexOptions.Compiled);
         Match   m   =   r.Match(HTMLStr.ToLower());
         if   (m.Success)
          str   =   m.Result("${url}");
         return   str;
        }
        #endregion

  • 相关阅读:
    wireshark安装
    高效使用搜索引擎
    MFC 网络编程 -- 总结
    MFC Grid control 2.27
    A Popup Progress Window
    Showing progress bar in a status bar pane
    Progress Control with Text
    Report List Controls
    第三方
    CBarChart柱形图类
  • 原文地址:https://www.cnblogs.com/easyteck/p/3481928.html
Copyright © 2011-2022 走看看