C# :
public string RemoveHTML(string html)
{
html = Regex.Replace(html, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"([
])[s]+", "", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"-->", "", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"<!--.*", "", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&(quot|#34);", """, RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&(iexcl|#161);", "xa1", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&(cent|#162);", "xa2", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&(pound|#163);", "xa3", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&(copy|#169);", "xa9", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"&#(d+);", "", RegexOptions.IgnoreCase);
html = Regex.Replace(html, @"<img[^>]*>;", "", RegexOptions.IgnoreCase);
html.Replace("<", "");
html.Replace(">", "");
html.Replace("
", "");
return html;
}
public static string[] GetHtmlImageUrlList(string sHtmlText)
{
// 定义正则表达式用来匹配 img 标签
Regex regImg = new Regex(@"<img[^<>]*?src[s
]*=[s
]*[""']?[s
]*(?<imgUrl>[^s
""'<>]*)[^<>]*?/?[s
]*>", RegexOptions.IgnoreCase);
// 搜索匹配的字符串
MatchCollection matches = regImg.Matches(sHtmlText);
int i = 0;
string[] sUrlList = new string[matches.Count];
// 取得匹配项列表
foreach (Match match in matches)
sUrlList[i++] = match.Groups["imgUrl"].Value;
return sUrlList;
}
js:
function getimgsrc(htmlstr) {
var reg = /<img.+?src=('|")?([^'"]+)('|")?(?:s+|>)/gim;
var arr = []; while (tem = reg.exec(htmlstr)) { arr.push(tem[2]); }
return arr;
}
function removeHTMLTag(str) {
str = str.replace(/</?[^>]*>/g, ''); //去除HTML tag
str = str.replace(/[ | ]*
/g, '
'); //去除行尾空白
//str = str.replace(/
[s| | ]*
/g,'
'); //去除多余空行
str = str.replace(/ /ig, ''); //去掉
return str;
}