zoukankan      html  css  js  c++  java
  • 3种方法从Html中取文本

    /// <summary>
    /// Strips markup from an HTML fragment and returns the remaining text, HTML-encoded.
    /// </summary>
    /// <param name="Htmlstring">HTML to clean. Null or empty input yields an empty string.</param>
    /// <returns>
    /// The input with script blocks, tags and comment remnants removed, a fixed set of
    /// entities decoded, any remaining angle brackets and CRLF pairs dropped, and the
    /// result HTML-encoded and trimmed.
    /// </returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script blocks. Singleline lets '.' match newlines, so scripts that
        // span several lines are removed together with their content.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove tags, then line breaks followed by whitespace, then comment remnants.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode the supported named/numeric entities.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);

        // Drop every other numeric entity.
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        // Decode "&amp;" last so that input such as "&amp;lt;" is decoded only once.
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);

        // string.Replace returns a new string; the results must be assigned back,
        // otherwise these three clean-up steps have no effect.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();

        return Htmlstring;
    }

    using System.Text.RegularExpressions;//需要引用

      // 利用正则表达式去掉"<"和">"之间的内容
      private string StripHT(string strHtml)
      {
       Regex regex=new Regex("<.+?>",RegexOptions.IgnoreCase);
       string strOutput=regex.Replace(strHtml,"");
       return strOutput;
      }

     

     

    2

    using System.Text.RegularExpressions;
    /// <summary>
    /// HtmlExtract 抽取html里面的文本信息
    /// </summary>
    public class HtmlExtract
    {
       
            #region private attributes
            private string _strHtml;
            #endregion
            #region public mehtods
             public HtmlExtract(string inStrHtml)
            { _strHtml = inStrHtml;}
            public string ExtractText()
            {
                string result = _strHtml;
                result = RemoveComment(result);
                result = RemoveScript(result);
                result = RemoveStyle(result);
                result = RemoveTags(result);
                return result.Trim();
            }
            #endregion
         #region private methods
           private string RemoveComment(string input)
    {
    string result = input;
    //remove comment
    result = Regex.Replace(result, @"<!--[^-]*-->", string.Empty, RegexOptions.IgnoreCase);
    return result;
    }

    3

    using System.Text.RegularExpressions;

    /// <summary>
    /// Extracts the text content of an HTML string by removing comments, script,
    /// noscript and style elements and all remaining tags, then decoding a small
    /// set of common entities.
    /// </summary>
    public class HtmlExtract
    {
        #region private attributes
        private readonly string _strHtml;
        #endregion

        #region public methods
        /// <summary>
        /// Creates an extractor for the given HTML string.
        /// </summary>
        /// <param name="inStrHtml">HTML to extract text from; may be null.</param>
        public HtmlExtract(string inStrHtml)
        {
            _strHtml = inStrHtml;
        }

        /// <summary>
        /// Returns the text of the HTML passed to the constructor.
        /// </summary>
        /// <returns>The extracted text, trimmed; an empty string for null or empty input.</returns>
        public string ExtractText()
        {
            if (string.IsNullOrEmpty(_strHtml))
            {
                return string.Empty;
            }

            string result = _strHtml;
            result = RemoveComment(result);
            result = RemoveScript(result);
            result = RemoveStyle(result);
            result = RemoveTags(result);
            return result.Trim();
        }
        #endregion

        #region private methods
        /// <summary>Removes HTML comments, including multi-line ones.</summary>
        private string RemoveComment(string input)
        {
            // Lazy match up to the first "-->"; unlike "[^-]*" this also matches
            // comments whose body contains a hyphen.
            return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
        }

        /// <summary>Removes style elements together with their content.</summary>
        private string RemoveStyle(string input)
        {
            return Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
        }

        /// <summary>Removes script and noscript elements together with their content.</summary>
        private string RemoveScript(string input)
        {
            string result = input;
            result = Regex.Replace(result, @"<script[^>]*?>.*?</script>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
            result = Regex.Replace(result, @"<noscript[^>]*?>.*?</noscript>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
            return result;
        }

        /// <summary>
        /// Converts br tags to CRLF, removes all remaining tags and decodes common entities.
        /// </summary>
        private string RemoveTags(string input)
        {
            // Turn line-break tags into real line breaks before the tags are stripped.
            string result = Regex.Replace(input, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);

            // Strip every remaining tag.
            result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);

            // Decode entities only after stripping, so escaped markup such as
            // "&lt;b&gt;" survives as literal text instead of being removed as a tag.
            result = result.Replace("&nbsp;", " ");
            result = result.Replace("&quot;", "\"");
            result = result.Replace("&#39;", "'");
            result = result.Replace("&lt;", "<");
            result = result.Replace("&gt;", ">");
            // "&amp;" goes last so that "&amp;lt;" becomes "&lt;" rather than "<".
            result = result.Replace("&amp;", "&");
            return result;
        }
        #endregion
    }

     

  • 相关阅读:
    javascript之数组的6种去重方法
    javascript之存储数据-cookie,localStorage,sessionStorage
    {less}
    javaScript的几个问题简答
    33 web页面-页面操作(鼠标、键盘操作)
    32 web页面-页面操作(元素等待、三大切换)
    31 Xpath复杂元素定位 find_element
    30 selenium (元素定位、webelement对象)
    29 HTML(定位标签的属性)
    28 selenium
  • 原文地址:https://www.cnblogs.com/glume/p/1997500.html
Copyright © 2011-2022 走看看