3种方法从Html中取文本

zoukankan html css js c++ java

3种方法从Html中取文本

/// <summary>
/// Strips script blocks, comments and markup tags from an HTML string and
/// collapses line breaks together with the whitespace that follows them.
/// </summary>
/// <param name="Htmlstring">The HTML to clean. May be null or empty.</param>
/// <returns>
/// The remaining text. Character entities such as <c>&amp;amp;</c> are left
/// as they are; an empty string is returned for null or empty input.
/// </returns>
public static string NoHTML(string Htmlstring)
{
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove scripts. Singleline lets '.' cross line breaks, otherwise the body
    // of a multi-line script would survive as plain text.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Remove comments before generic tags: a comment may contain '>', which
    // would make the tag pattern below stop in the middle of the comment.
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*?-->", "", RegexOptions.Singleline);

    // Remove HTML tags.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove each line break together with the whitespace run that follows it.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove any dangling comment terminator left by malformed markup.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);

    return Htmlstring;
}

3,

using System.Text.RegularExpressions;

/// <summary>
/// Extracts the text content of an HTML fragment by removing comments,
/// script/noscript blocks, style blocks and markup tags, and then decoding
/// character entities.
/// </summary>
/// <remarks>
/// The result is plain text. Because entities are decoded, it can contain
/// '&lt;' and '&gt;' characters and must be encoded again before being
/// embedded in HTML.
/// </remarks>
public class HtmlExtract
{
    #region private attributes

    // Singleline makes '.' match line breaks so multi-line blocks are removed as a unit.
    private static readonly Regex CommentPattern =
        new Regex(@"<!--.*?-->", RegexOptions.Singleline);

    private static readonly Regex StylePattern =
        new Regex(@"<style[^>]*?>.*?</style>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex ScriptPattern =
        new Regex(@"<script[^>]*?>.*?</script>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    private static readonly Regex NoScriptPattern =
        new Regex(@"<noscript[^>]*?>.*?</noscript>", RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Matches <br>, <BR>, <br/> and <br />.
    private static readonly Regex LineBreakPattern =
        new Regex(@"<br\s*/?>", RegexOptions.IgnoreCase);

    private static readonly Regex TagPattern =
        new Regex(@"<[\s\S]*?>");

    private readonly string _strHtml;

    #endregion

    #region public methods

    /// <summary>
    /// Creates an extractor for the given HTML.
    /// </summary>
    /// <param name="inStrHtml">The HTML source; null is treated as an empty string.</param>
    public HtmlExtract(string inStrHtml)
    {
        _strHtml = inStrHtml ?? string.Empty;
    }

    /// <summary>
    /// Removes comments, scripts, styles and tags (in that order) from the HTML
    /// passed to the constructor.
    /// </summary>
    /// <returns>The extracted text with leading and trailing whitespace trimmed.</returns>
    public string ExtractText()
    {
        string result = _strHtml;
        // Comments go first: they may contain '>' or tag-like text that would
        // confuse the later patterns.
        result = RemoveComment(result);
        result = RemoveScript(result);
        result = RemoveStyle(result);
        result = RemoveTags(result);
        return result.Trim();
    }

    #endregion

    #region private methods

    /// <summary>Removes HTML comments, including multi-line ones.</summary>
    private string RemoveComment(string input)
    {
        return CommentPattern.Replace(input, string.Empty);
    }

    /// <summary>Removes style elements together with their content.</summary>
    private string RemoveStyle(string input)
    {
        return StylePattern.Replace(input, string.Empty);
    }

    /// <summary>Removes script and noscript elements together with their content.</summary>
    private string RemoveScript(string input)
    {
        string result = ScriptPattern.Replace(input, string.Empty);
        return NoScriptPattern.Replace(result, string.Empty);
    }

    /// <summary>
    /// Converts line-break tags to "\r\n", removes every remaining tag and
    /// decodes character entities.
    /// </summary>
    private string RemoveTags(string input)
    {
        // Line breaks must be converted before the generic tag pattern deletes them.
        string result = LineBreakPattern.Replace(input, "\r\n");
        result = TagPattern.Replace(result, string.Empty);

        // Decode entities only after the tags are gone; decoding first would turn
        // escaped text such as "&lt;b&gt;" into a tag that then gets stripped.
        result = System.Net.WebUtility.HtmlDecode(result);

        // &nbsp; decodes to U+00A0; callers expect an ordinary space.
        return result.Replace('\u00A0', ' ');
    }

    #endregion
}

查看全文

相关阅读:
Flush输出表格内容
 [From Linux Toy] inxi– bash script to get system information
.NET中Mocking框架的对比
 Python中的对象和动态性 [菜鸟的理解，高手莫入]
《正见佛陀的证悟》读书摘记（1）
Quote for tomorrow
不抱怨就会死
 web deploy + windows server 2008 r2 64bit + iis 7.5
bootsnipp
[转载] A Beginner's Guide to HTTP Cache Headers

原文地址：https://www.cnblogs.com/glume/p/1997500.html