zoukankan      html  css  js  c++  java
  • HTMLScript 去除关键字

    以下是引用片段:
      -----
    /**/
    /// <summary>
    /// Strips HTML markup from a string and returns the remaining text, HTML-encoded and trimmed.
    /// </summary>
    /// <remarks>
    /// Processing order: script blocks, comments, tags, line breaks followed by whitespace,
    /// stray comment delimiters, character entities, leftover angle brackets and CRLF pairs,
    /// and finally HTML-encoding of the result.
    /// </remarks>
    /// <param name="Htmlstring">Source text that may contain HTML markup. May be null.</param>
    /// <returns>
    /// The HTML-encoded plain text, or an empty string when <paramref name="Htmlstring"/> is null or empty.
    /// </returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        const RegexOptions ignoreCase = RegexOptions.IgnoreCase;
        // Singleline lets '.' cross line breaks so multi-line scripts and comments are removed whole.
        const RegexOptions spanLines = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        // Remove script blocks together with their content.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", spanLines);
        // Remove complete comments first; the tag pattern below would stop at the first '>'
        // inside a comment and leak the rest of it into the output.
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", spanLines);
        // Remove tags.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", ignoreCase);
        // Remove a line break together with the whitespace that follows it.
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", ignoreCase);
        // Remove stray comment terminators and unterminated comments (to end of line).
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", ignoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", ignoreCase);

        // Decode entities in ONE pass so that "&amp;lt;" becomes the text "&lt;" rather than
        // being decoded a second time into "<".
        Htmlstring = Regex.Replace(
            Htmlstring,
            @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
            new MatchEvaluator(DecodeNoHtmlEntity),
            ignoreCase);

        // Strings are immutable: the results of Replace must be assigned to take effect.
        Htmlstring = Htmlstring.Replace("<", "").Replace(">", "").Replace("\r\n", "");

        // HttpUtility.HtmlEncode is the static routine behind Server.HtmlEncode; calling it
        // directly avoids a NullReferenceException when no HttpContext is current.
        return HttpUtility.HtmlEncode(Htmlstring).Trim();
    }

    /// <summary>
    /// Maps one matched character entity to its replacement text for <see cref="NoHTML"/>.
    /// </summary>
    /// <param name="entity">A match whose first group is the entity name or "#" plus decimal digits.</param>
    /// <returns>The decoded character, or an empty string for numeric references that are not mapped.</returns>
    private static string DecodeNoHtmlEntity(Match entity)
    {
        switch (entity.Groups[1].Value.ToLowerInvariant())
        {
            case "quot":
            case "#34":
                return "\"";
            case "amp":
            case "#38":
                return "&";
            case "lt":
            case "#60":
                return "<";
            case "gt":
            case "#62":
                return ">";
            case "nbsp":
            case "#160":
                return " ";
            case "iexcl":
            case "#161":
                return "\u00A1";
            case "cent":
            case "#162":
                return "\u00A2";
            case "pound":
            case "#163":
                return "\u00A3";
            case "copy":
            case "#169":
                return "\u00A9";
            default:
                // Any other numeric character reference is dropped.
                return string.Empty;
        }
    }
    
    // Sample program: a C# function that extracts the text from HTML source.
    // StripHTML(strHtml) takes source text that includes HTML markup
    // and returns the text with the HTML markup removed.
    using System;
    using System.Text.RegularExpressions;
    /// <summary>
    /// Small console sample that strips HTML markup from a string.
    /// </summary>
    public class StripHTMLTest
    {
        // Quote-aware tag matcher: optional "/" and "!", a tag name with optional namespace
        // prefix, then any mix of quoted strings and other characters up to the closing ">".
        // The atomic groups (?>...) stop the engine from re-partitioning a run of word
        // characters, so an unclosed tag such as "<aaaa..." fails fast instead of
        // backtracking exponentially.
        private const string TagPattern =
            @"<(?:/\s*)?!?(?>(?:\w+:)?\w+)(?>(?:""[^""]*""|'[^']*'|[^'""<>])*)>";

        // Named entities handled explicitly, plus any decimal character reference.
        private const string EntityPattern =
            @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);";

        /// <summary>
        /// Strips a fixed sample document and writes the remaining text to the console.
        /// </summary>
        public static void Main()
        {
            string s = StripHTML(
                "<HTML><HEAD><TITLE>中国石龙信息平台</TITLE></HEAD><BODY>faddfs龙信息平台</BODY></HTML>");
            Console.WriteLine(s);
        }

        /// <summary>
        /// Removes HTML markup from a string and decodes a small set of character entities.
        /// </summary>
        /// <param name="strHtml">Source text that may contain HTML markup. May be null.</param>
        /// <returns>
        /// The text with scripts, comments, tags, angle brackets and CRLF pairs removed,
        /// or an empty string when <paramref name="strHtml"/> is null or empty.
        /// </returns>
        public static string StripHTML(string strHtml)
        {
            if (string.IsNullOrEmpty(strHtml))
            {
                return string.Empty;
            }

            const RegexOptions ignoreCase = RegexOptions.IgnoreCase;
            // Singleline lets '.' cross line breaks so multi-line scripts and comments are removed whole.
            const RegexOptions spanLines = RegexOptions.IgnoreCase | RegexOptions.Singleline;

            string strOutput = strHtml;

            // Script blocks, including their content.
            strOutput = Regex.Replace(strOutput, @"<script[^>]*?>.*?</script>", "", spanLines);
            // Complete comments; the tag pattern does not match "<!--".
            strOutput = Regex.Replace(strOutput, @"<!--.*?-->", "", spanLines);
            // Tags.
            strOutput = Regex.Replace(strOutput, TagPattern, "", ignoreCase);
            // A line break together with the whitespace that follows it.
            strOutput = Regex.Replace(strOutput, @"([\r\n])[\s]+", "", ignoreCase);
            // Entities, decoded in ONE pass so "&amp;lt;" yields the text "&lt;", not "<".
            strOutput = Regex.Replace(
                strOutput, EntityPattern, new MatchEvaluator(DecodeEntity), ignoreCase);
            // Stray comment terminators become line breaks, then an unterminated comment
            // opener is removed up to the end of its line.
            strOutput = Regex.Replace(strOutput, @"-->", "\r\n", ignoreCase);
            strOutput = Regex.Replace(strOutput, @"<!--.*\n", "", ignoreCase);

            // Strings are immutable: the results of Replace must be used to take effect.
            // Removing leftover angle brackets also keeps decoded "&lt;script&gt;" from
            // turning back into live markup.
            return strOutput.Replace("<", "").Replace(">", "").Replace("\r\n", "");
        }

        /// <summary>
        /// Maps one matched character entity to its replacement text.
        /// </summary>
        /// <param name="entity">A match whose first group is the entity name or "#" plus decimal digits.</param>
        /// <returns>The decoded character, or an empty string for numeric references that are not mapped.</returns>
        private static string DecodeEntity(Match entity)
        {
            switch (entity.Groups[1].Value.ToLowerInvariant())
            {
                case "quot":
                case "#34":
                    return "\"";
                case "amp":
                case "#38":
                    return "&";
                case "lt":
                case "#60":
                    return "<";
                case "gt":
                case "#62":
                    return ">";
                case "nbsp":
                case "#160":
                    return " ";
                case "iexcl":
                case "#161":
                    return "\u00A1"; // chr(161)
                case "cent":
                case "#162":
                    return "\u00A2"; // chr(162)
                case "pound":
                case "#163":
                    return "\u00A3"; // chr(163)
                case "copy":
                case "#169":
                    return "\u00A9"; // chr(169)
                default:
                    // Any other numeric character reference is dropped.
                    return string.Empty;
            }
        }
    }
    
    写一个静态方法移除HTML标签
    #region
    /// <summary>
    /// Removes HTML tags from a string.
    /// </summary>
    /// <remarks>
    /// A tag is any run that starts with "&lt;", ends at the next "&gt;", and has no "&gt;" in between.
    /// </remarks>
    /// <param name="HTMLStr">Text to strip tags from.</param>
    /// <returns>The input with every such run deleted.</returns>
    public static string ParseTags(string HTMLStr)
    {
        System.Text.RegularExpressions.Regex tagMatcher =
            new System.Text.RegularExpressions.Regex("<[^>]*>");
        string withoutTags = tagMatcher.Replace(HTMLStr, string.Empty);
        return withoutTags;
    }
    
    #endregion
    
    /// 取出文本中的图片地址
    #region
    /// <summary>
    /// Returns the <c>src</c> value of the first <c>img</c> tag found in an HTML fragment.
    /// </summary>
    /// <remarks>
    /// Matching is case-insensitive, but the URL is returned exactly as written, because
    /// URL paths can be case-sensitive. Double-quoted, single-quoted and unquoted attribute
    /// values are supported, and the surrounding quotes are not part of the result.
    /// </remarks>
    /// <param name="HTMLStr">HTML text to search. May be null.</param>
    /// <returns>
    /// The image URL, or an empty string when the input is null or empty or contains no
    /// <c>img</c> tag with a <c>src</c> attribute.
    /// </returns>
    public static string GetImgUrl(string HTMLStr)
    {
        if (string.IsNullOrEmpty(HTMLStr))
        {
            return string.Empty;
        }

        // "src" must directly follow whitespace so attributes such as "data-src" are skipped.
        // The three alternatives share the group name "url"; only one takes part in a match.
        Match m = Regex.Match(
            HTMLStr,
            @"<img\s+(?:[^>]*?\s)?src\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s'"">]+))",
            RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

        return m.Success ? m.Groups["url"].Value : string.Empty;
    }
    
    #endregion
    
    
    本文来源:IT传媒网
    原文链接:http://www.cniter.com/tech/asp.net/csharp/remove_html_tag_10806_1.html
  • 相关阅读:
    没用完的手机流量是否清零?讨论+吐槽
    南方周末:《系统》
    如何将Excel表批量赋值到ArcGIS属性表
    解决4K屏电脑显示问题
    坐标或测量值超出范围
    快速手工实现软件著作权源码60页制作
    SVN版本更新自动通知提醒
    1130不允许连接到MySql server
    Win10中SVN图标不显示的解决
    注意地理坐标系下的距离和面积计算
  • 原文地址:https://www.cnblogs.com/keyyang/p/4978306.html
Copyright © 2011-2022 走看看