zoukankan      html  css  js  c++  java
  • 3种方法从Html中取文本

    /// <summary>
    /// Reduces an HTML fragment to text: removes script blocks and tags, collapses
    /// line breaks that are followed by whitespace, decodes a small set of character
    /// entities, drops every other numeric entity, removes any remaining angle
    /// brackets and CR/LF pairs, and finally HTML-encodes and trims the result.
    /// </summary>
    /// <param name="Htmlstring">The HTML markup to process; must not be null.</param>
    /// <returns>The extracted text, HTML-encoded and trimmed.</returns>
    /// <exception cref="System.ArgumentNullException">
    /// Thrown by <see cref="Regex.Replace(string, string, string, RegexOptions)"/> when
    /// <paramref name="Htmlstring"/> is null.
    /// </exception>
    public static string NoHTML(string Htmlstring)
    {
        // Remove script blocks together with their bodies. Singleline lets '.' match
        // newlines, so scripts that span several lines are removed as well.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        // Remove HTML tags.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        // Drop a line break together with the whitespace that follows it.
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        // Remove comment residue that the tag pattern did not consume.
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
        // Decode the supported named/numeric character entities.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
        // Any other numeric entity is discarded.
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        // Strings are immutable: Replace returns a new string, so the result has to be
        // assigned back for the removal to take effect. Note that this also removes
        // angle brackets produced by decoding &lt; / &gt; above.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // The static HttpUtility.HtmlEncode performs the same encoding as
        // HttpContext.Current.Server.HtmlEncode but does not require an active
        // HTTP request (HttpContext.Current is null outside of one).
        Htmlstring = HttpUtility.HtmlEncode(Htmlstring).Trim();

        return Htmlstring;
    }

    using System.Text.RegularExpressions;//需要引用

      // 利用正则表达式去掉"<"和">"之间的内容
      private string StripHT(string strHtml)
      {
       Regex regex=new Regex("<.+?>",RegexOptions.IgnoreCase);
       string strOutput=regex.Replace(strHtml,"");
       return strOutput;
      }

     

     

    2

    using System.Text.RegularExpressions;
    /// <summary>
    /// HtmlExtract 抽取html里面的文本信息
    /// </summary>
    public class HtmlExtract
    {
       
            #region private attributes
            private string _strHtml;
            #endregion
            #region public mehtods
             public HtmlExtract(string inStrHtml)
            { _strHtml = inStrHtml;}
            public string ExtractText()
            {
                string result = _strHtml;
                result = RemoveComment(result);
                result = RemoveScript(result);
                result = RemoveStyle(result);
                result = RemoveTags(result);
                return result.Trim();
            }
            #endregion
         #region private methods
           private string RemoveComment(string input)
    {
    string result = input;
    //remove comment
    result = Regex.Replace(result, @"<!--[^-]*-->", string.Empty, RegexOptions.IgnoreCase);
    return result;
    }

    3

    using System.Text.RegularExpressions;

    /// <summary>
    /// Extracts the plain text contained in an HTML document or fragment.
    /// </summary>
    public class HtmlExtract
    {
        #region private attributes
        // The HTML source supplied to the constructor; never null.
        private readonly string _strHtml;
        #endregion

        #region public methods
        /// <summary>
        /// Creates an extractor for the given HTML source.
        /// </summary>
        /// <param name="inStrHtml">The HTML markup to extract text from. Null is treated as empty.</param>
        public HtmlExtract(string inStrHtml)
        {
            // Regex.Replace throws on a null input, so normalise null to an empty
            // document here instead of failing later inside ExtractText.
            _strHtml = inStrHtml ?? string.Empty;
        }

        /// <summary>
        /// Removes comments, script/noscript blocks, style blocks and tags from the
        /// stored HTML, in that order, and returns the remaining text.
        /// </summary>
        /// <returns>The extracted text with leading and trailing whitespace trimmed.</returns>
        public string ExtractText()
        {
            string result = _strHtml;
            result = RemoveComment(result);
            result = RemoveScript(result);
            result = RemoveStyle(result);
            result = RemoveTags(result);
            return result.Trim();
        }
        #endregion

        #region private methods
        /// <summary>Removes HTML comments from the input.</summary>
        private string RemoveComment(string input)
        {
            // A lazy '.*?' with Singleline stops at the first "-->" and spans line
            // breaks. A "[^-]*" body would fail to match any comment that contains
            // a hyphen (for example "<!-- left-nav -->").
            return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
        }

        /// <summary>Removes style elements together with their contents.</summary>
        private string RemoveStyle(string input)
        {
            return Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
        }

        /// <summary>Removes script and noscript elements together with their contents.</summary>
        private string RemoveScript(string input)
        {
            string result = input;
            result = Regex.Replace(result, @"<script[^>]*?>.*?</script>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
            result = Regex.Replace(result, @"<noscript[^>]*?>.*?</noscript>", string.Empty, RegexOptions.IgnoreCase | RegexOptions.Singleline);
            return result;
        }

        /// <summary>
        /// Turns line-break tags into CR/LF, removes every remaining tag, and then
        /// decodes the common character entities.
        /// </summary>
        private string RemoveTags(string input)
        {
            string result = input;

            // Accept <br>, <BR>, <br/> and <br /> as line breaks.
            result = Regex.Replace(result, @"<br\s*/?>", "\r\n", RegexOptions.IgnoreCase);
            result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty);

            // Entities are decoded only after the tags are gone; decoding first would
            // turn escaped text such as "&lt;b&gt;" into "<b>" and the tag pattern
            // would then delete it.
            result = result.Replace("&nbsp;", " ");
            result = result.Replace('\u00A0', ' ');
            result = result.Replace("&quot;", "\"");
            result = result.Replace("&#39;", "'");
            result = result.Replace("&lt;", "<");
            result = result.Replace("&gt;", ">");
            // "&amp;" is decoded last so that "&amp;lt;" yields "&lt;" rather than "<".
            result = result.Replace("&amp;", "&");
            return result;
        }
        #endregion
    }

     

  • 相关阅读:
    DC综合流程
    DC set_tcl脚本配置
    同步FIFO设计
    顺序脉冲 发生器
    状态机的写法
    verilog串并转换
    indexOf()
    jQuery 效果
    jQuery 事件
    jQuery css
  • 原文地址:https://www.cnblogs.com/glume/p/1997500.html
Copyright © 2011-2022 走看看