最近在做我们学校微软创新之旅的专题网站,文章内容是从原来的新闻系统里提取的。原来的新闻系统的文本编辑器用的是 FCKeditor,用户只要直接输入文字并上传图片,就可以生成相应的 HTML 代码并存入数据库。我想把已发布文章中的图片提取出来放在桌面,就必须把图片地址从 HTML 代码里提取出来;同时我还需要把去掉 HTML 标签后的纯文本放入 RSS 中供用户订阅,所以只能想个办法解决这些问题。
using System;
using System.Data;
using System.Configuration;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;

/// <summary>
/// Regex-based helpers for turning editor-generated HTML into plain text
/// and for pulling the first image URL out of an HTML fragment.
/// </summary>
public class FiterHtml
{
    // Matches a whole <script>...</script> element. It is always used with
    // RegexOptions.Singleline so that script bodies spanning several lines are removed too.
    private const string ScriptPattern = @"<script[^>]*?>.*?</script>";

    // Matches a CR or LF together with the whitespace run that follows it.
    private const string LineBreakPattern = @"([\r\n])[\s]+";

    // Attribute-aware tag matcher used by StripHTML. Group 7 captures the opening
    // quote of an attribute value and \7 requires the same quote to close it.
    // NOTE(review): the nested quantifiers may backtrack heavily on malformed,
    // unterminated tags - consider a match timeout if the input is untrusted.
    private const string TagPattern =
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>";

    // Entity patterns and their replacements, applied in this order.
    // The final entry deletes every numeric entity that was not handled before it.
    private static readonly string[] EntityPatterns =
    {
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        @"&#(\d+);"
    };

    private static readonly string[] EntityReplacements =
    {
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\u00A1",
        "\u00A2",
        "\u00A3",
        "\u00A9",
        ""
    };

    // Finds the src attribute of an <img> tag; the value may be double-quoted,
    // single-quoted or unquoted. Matching is case-insensitive so the input does
    // not have to be lower-cased (which would corrupt case-sensitive URLs).
    private static readonly Regex ImgSrcRegex = new Regex(
        @"<img\b[^>]*?\ssrc\s*=\s*(?:""(?<url>[^""]*)""|'(?<url>[^']*)'|(?<url>[^\s>]+))",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    public FiterHtml()
    {
    }

    /// <summary>
    /// Removes scripts and HTML tags, decodes a small set of entities and
    /// returns the remaining text HTML-encoded and trimmed.
    /// </summary>
    /// <param name="Htmlstring">HTML fragment to clean; may be null or empty.</param>
    /// <returns>The encoded plain text, or an empty string for null/empty input.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        string text = Htmlstring;

        // Remove scripts first so their bodies do not survive as text.
        text = Regex.Replace(text, ScriptPattern, "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove tags, then line breaks with trailing whitespace, then comment leftovers.
        text = Regex.Replace(text, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, LineBreakPattern, "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"-->", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<!--.*", "", RegexOptions.IgnoreCase);

        text = DecodeEntities(text);

        // Angle brackets produced by entity decoding are not stripped here:
        // the encoding step below turns them back into harmless entities.
        // HttpUtility.HtmlEncode is used instead of HttpContext.Current.Server.HtmlEncode
        // so the method also works when there is no current HTTP request.
        return HttpUtility.HtmlEncode(text).Trim();
    }

    /// <summary>
    /// Removes scripts, tags (attribute-aware) and HTML comments, decodes a small
    /// set of entities and returns the remaining text without HTML-encoding it.
    /// </summary>
    /// <param name="strHtml">HTML fragment to clean; may be null or empty.</param>
    /// <returns>The plain text, or an empty string for null/empty input.</returns>
    public static string StripHTML(string strHtml)
    {
        if (string.IsNullOrEmpty(strHtml))
        {
            return string.Empty;
        }

        string strOutput = strHtml;

        strOutput = Regex.Replace(strOutput, ScriptPattern, "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        strOutput = Regex.Replace(strOutput, TagPattern, "", RegexOptions.IgnoreCase);
        strOutput = Regex.Replace(strOutput, LineBreakPattern, "", RegexOptions.IgnoreCase);
        strOutput = DecodeEntities(strOutput);

        // A comment terminator is turned into a line break so that the next
        // pattern, which runs from "<!--" to the end of the line, removes the whole comment.
        strOutput = Regex.Replace(strOutput, @"-->", "\r\n", RegexOptions.IgnoreCase);
        strOutput = Regex.Replace(strOutput, @"<!--.*\n", "", RegexOptions.IgnoreCase);

        // String.Replace returns a new string; the results must be assigned.
        // The output is not encoded, so angle brackets created by entity decoding
        // are dropped here to avoid handing markup back to the caller.
        strOutput = strOutput.Replace("<", "");
        strOutput = strOutput.Replace(">", "");
        strOutput = strOutput.Replace("\r\n", "");

        return strOutput;
    }

    /// <summary>
    /// Removes everything between a '&lt;' and the next '&gt;'.
    /// </summary>
    /// <param name="HTMLStr">HTML fragment to clean; may be null or empty.</param>
    /// <returns>The text without tags, or an empty string for null/empty input.</returns>
    public static string ParseTags(string HTMLStr)
    {
        if (string.IsNullOrEmpty(HTMLStr))
        {
            return string.Empty;
        }

        return Regex.Replace(HTMLStr, "<[^>]*>", "");
    }

    /// <summary>
    /// Returns the src value of the first img tag found in the text.
    /// </summary>
    /// <param name="HTMLStr">HTML fragment to search; may be null or empty.</param>
    /// <returns>
    /// The URL with its original casing and without surrounding quotes,
    /// or an empty string when no img tag with a src attribute is found.
    /// </returns>
    public static string GetImgUrl(string HTMLStr)
    {
        if (string.IsNullOrEmpty(HTMLStr))
        {
            return string.Empty;
        }

        Match m = ImgSrcRegex.Match(HTMLStr);
        if (m.Success)
        {
            return m.Groups["url"].Value;
        }

        return string.Empty;
    }

    // Applies the entity table in order: decodes the named/numeric entities it
    // knows about and deletes any other numeric entity.
    private static string DecodeEntities(string text)
    {
        for (int i = 0; i < EntityPatterns.Length; i++)
        {
            text = Regex.Replace(text, EntityPatterns[i], EntityReplacements[i], RegexOptions.IgnoreCase);
        }

        return text;
    }
}
// Test case:
/// <summary>
/// Demo page handler: runs a sample HTML fragment through FiterHtml.NoHTML
/// and writes the result to the response.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event arguments (not used).</param>
protected void Page_Load(object sender, EventArgs e)
{
    // Double quotes inside the literal must be escaped with a backslash.
    string test = "例:<strong id=\"abc\">你好</strong><p><a href=\"http://green.njut.edu.cn/images/gslogo.png\" target=\"_blank\">绿荫网</a></p><IMG title=\"绿荫网\" src=\"http://121.9.206.74/Gift/face/2.gif\" border=0>H<EM>i,星烛网</EM>";

    // NoHTML is static, so no FiterHtml instance is needed.
    Response.Write(FiterHtml.NoHTML(test));
}