zoukankan      html  css  js  c++  java
  • C#HTML 转文本及HTML内容提取 .

    //1、HTML直接转文本

    //使用方法
    HtmlToText convert = new HtmlToText();
    textBox2.Text = convert.Convert(textBox1.Text);


    //代码
    /// <summary>
    /// Converts HTML to plain text.
    /// </summary>
    class HtmlToText
    {
        // Static data tables
        protected static Dictionary<string, string> _tags;
        protected static HashSet<string> _ignoreTags;

        // Instance variables
        protected TextBuilder _text;
        protected string _html;
        protected int _pos;

        // Static constructor (one time only)
        static HtmlToText()
        {
            _tags = new Dictionary<string, string>();
            _tags.Add("address", "\n");
            _tags.Add("blockquote", "\n");
            _tags.Add("div", "\n");
            _tags.Add("dl", "\n");
            _tags.Add("fieldset", "\n");
            _tags.Add("form", "\n");
            _tags.Add("h1", "\n");
            _tags.Add("/h1", "\n");
            _tags.Add("h2", "\n");
            _tags.Add("/h2", "\n");
            _tags.Add("h3", "\n");
            _tags.Add("/h3", "\n");
            _tags.Add("h4", "\n");
            _tags.Add("/h4", "\n");
            _tags.Add("h5", "\n");
            _tags.Add("/h5", "\n");
            _tags.Add("h6", "\n");
            _tags.Add("/h6", "\n");
            _tags.Add("p", "\n");
            _tags.Add("/p", "\n");
            _tags.Add("table", "\n");
            _tags.Add("/table", "\n");
            _tags.Add("ul", "\n");
            _tags.Add("/ul", "\n");
            _tags.Add("ol", "\n");
            _tags.Add("/ol", "\n");
            _tags.Add("/li", "\n");
            _tags.Add("br", "\n");
            _tags.Add("/td", "\t");
            _tags.Add("/tr", "\n");
            _tags.Add("/pre", "\n");

            _ignoreTags = new HashSet<string>();
            _ignoreTags.Add("script");
            _ignoreTags.Add("noscript");
            _ignoreTags.Add("style");
            _ignoreTags.Add("object");
        }

        /// <summary>
        /// Converts the given HTML to plain text and returns the result.
        /// </summary>
        /// <param name="html">HTML to be converted</param>
        /// <returns>Resulting plain text</returns>
        public string Convert(string html)
        {
            // Initialize state variables
            _text = new TextBuilder();
            _html = html;
            _pos = 0;

            // Process input
            while (!EndOfText)
            {
                if (Peek() == '<')
                {
                    // HTML tag
                    bool selfClosing;
                    string tag = ParseTag(out selfClosing);

                    // Handle special tag cases
                    if (tag == "body")
                    {
                        // Discard content before <body>
                        _text.Clear();
                    }
                    else if (tag == "/body")
                    {
                        // Discard content after </body>
                        _pos = _html.Length;
                    }
                    else if (tag == "pre")
                    {
                        // Enter preformatted mode
                        _text.Preformatted = true;
                        EatWhitespaceToNextLine();
                    }
                    else if (tag == "/pre")
                    {
                        // Exit preformatted mode
                        _text.Preformatted = false;
                    }

                    string value;
                    if (_tags.TryGetValue(tag, out value))
                        _text.Write(value);

                    if (_ignoreTags.Contains(tag))
                        EatInnerContent(tag);
                }
                else if (Char.IsWhiteSpace(Peek()))
                {
                    // Whitespace (treat all as space)
                    _text.Write(_text.Preformatted ? Peek() : ' ');
                    MoveAhead();
                }
                else
                {
                    // Other text
                    _text.Write(Peek());
                    MoveAhead();
                }
            }
            // Return result
            return HttpUtility.HtmlDecode(_text.ToString());
        }

        // Eats all characters that are part of the current tag
        // and returns information about that tag
        protected string ParseTag(out bool selfClosing)
        {
            string tag = String.Empty;
            selfClosing = false;

            if (Peek() == '<')
            {
                MoveAhead();

                // Parse tag name
                EatWhitespace();
                int start = _pos;
                if (Peek() == '/')
                    MoveAhead();
                while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                    Peek() != '/' && Peek() != '>')
                    MoveAhead();
                tag = _html.Substring(start, _pos - start).ToLower();

                // Parse rest of tag
                while (!EndOfText && Peek() != '>')
                {
                    if (Peek() == '"' || Peek() == '\'')
                        EatQuotedValue();
                    else
                    {
                        if (Peek() == '/')
                            selfClosing = true;
                        MoveAhead();
                    }
                }
                MoveAhead();
            }
            return tag;
        }

        // Consumes inner content from the current tag
        protected void EatInnerContent(string tag)
        {
            string endTag = "/" + tag;

            while (!EndOfText)
            {
                if (Peek() == '<')
                {
                    // Consume a tag
                    bool selfClosing;
                    if (ParseTag(out selfClosing) == endTag)
                        return;
                    // Use recursion to consume nested tags
                    if (!selfClosing && !tag.StartsWith("/"))
                        EatInnerContent(tag);
                }
                else MoveAhead();
            }
        }

        // Returns true if the current position is at the end of
        // the string
        protected bool EndOfText
        {
            get { return (_pos >= _html.Length); }
        }

        // Safely returns the character at the current position
        protected char Peek()
        {
            return (_pos < _html.Length) ? _html[_pos] : (char)0;
        }

        // Safely advances to current position to the next character
        protected void MoveAhead()
        {
            _pos = Math.Min(_pos + 1, _html.Length);
        }

        // Moves the current position to the next non-whitespace
        // character.
        protected void EatWhitespace()
        {
            while (Char.IsWhiteSpace(Peek()))
                MoveAhead();
        }

        // Moves the current position to the next non-whitespace
        // character or the start of the next line, whichever
        // comes first
        protected void EatWhitespaceToNextLine()
        {
            while (Char.IsWhiteSpace(Peek()))
            {
                char c = Peek();
                MoveAhead();
                if (c == '\n')
                    break;
            }
        }

        // Moves the current position past a quoted value
        protected void EatQuotedValue()
        {
            char c = Peek();
            if (c == '"' || c == '\'')
            {
                // Opening quote
                MoveAhead();
                // Find end of value
                int start = _pos;
                _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
                if (_pos < 0)
                    _pos = _html.Length;
                else
                    MoveAhead();    // Closing quote
            }
        }

        /// <summary>
        /// A StringBuilder class that helps eliminate excess whitespace.
        /// </summary>
        protected class TextBuilder
        {
            private StringBuilder _text;
            private StringBuilder _currLine;
            private int _emptyLines;
            private bool _preformatted;

            // Construction
            public TextBuilder()
            {
                _text = new StringBuilder();
                _currLine = new StringBuilder();
                _emptyLines = 0;
                _preformatted = false;
            }

            /// <summary>
            /// Normally, extra whitespace characters are discarded.
            /// If this property is set to true, they are passed
            /// through unchanged.
            /// </summary>
            public bool Preformatted
            {
                get
                {
                    return _preformatted;
                }
                set
                {
                    if (value)
                    {
                        // Clear line buffer if changing to
                        // preformatted mode
                        if (_currLine.Length > 0)
                            FlushCurrLine();
                        _emptyLines = 0;
                    }
                    _preformatted = value;
                }
            }

            /// <summary>
            /// Clears all current text.
            /// </summary>
            public void Clear()
            {
                _text.Length = 0;
                _currLine.Length = 0;
                _emptyLines = 0;
            }

            /// <summary>
            /// Writes the given string to the output buffer.
            /// </summary>
            /// <param name="s"></param>
            public void Write(string s)
            {
                foreach (char c in s)
                    Write(c);
            }

            /// <summary>
            /// Writes the given character to the output buffer.
            /// </summary>
            /// <param name="c">Character to write</param>
            public void Write(char c)
            {
                if (_preformatted)
                {
                    // Write preformatted character
                    _text.Append(c);
                }
                else
                {
                    if (c == '\r')
                    {
                        // Ignore carriage returns. We'll process
                        // '\n' if it comes next
                    }
                    else if (c == '\n')
                    {
                        // Flush current line
                        FlushCurrLine();
                    }
                    else if (Char.IsWhiteSpace(c))
                    {
                        // Write single space character
                        int len = _currLine.Length;
                        if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                            _currLine.Append(' ');
                    }
                    else
                    {
                        // Add character to current line
                        _currLine.Append(c);
                    }
                }
            }

            // Appends the current line to output buffer
            protected void FlushCurrLine()
            {
                // Get current line
                string line = _currLine.ToString().Trim();

                // Determine if line contains non-space characters
                string tmp = line.Replace(" ", String.Empty);
                if (tmp.Length == 0)
                {
                    // An empty line
                    _emptyLines++;
                    if (_emptyLines < 2 && _text.Length > 0)
                        _text.AppendLine(line);
                }
                else
                {
                    // A non-empty line
                    _emptyLines = 0;
                    _text.AppendLine(line);
                }

                // Reset current line
                _currLine.Length = 0;
            }

            /// <summary>
            /// Returns the current output as a string.
            /// </summary>
            public override string ToString()
            {
                if (_currLine.Length > 0)
                    FlushCurrLine();
                return _text.ToString();
            }
        }
    }

    //2、提取html的正文 类
    using System;
     using System.Text;
     namespace HtmlStrip
     {
         class MainClass
         {
             public static void Main (string[] args)
             {
                 string str = "<div>abc</div><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo";
                 //System.IO.StreamReader rd=new System.IO.StreamReader ("/home/lx/test.html");
                 //str=rd.ReadToEnd ();
                 HtmlParser t = new HtmlParser (str); //
                 t.KeepTag (new string[] { "br" }); //设置br标签不过虑
                 Console.Write (t.Text ());
             }
            
            
            
         }
         class HtmlParser
         {
             private string[] htmlcode; //把html转为数组形式用于分析
             private StringBuilder result = new StringBuilder ();  //输出的结果
             private int seek; //分析文本时候的指针位置
             private string[] keepTag;  //用于保存要保留的尖括号内容
             private bool _inTag;  //标记现在的指针是不是在尖括号内
             private bool needContent = true;  //是否要提取正文
             private string tagName;  //当前尖括号的名字
             private string[] specialTag = new string[] { "script", "style", "!--" };  //特殊的尖括号内容,一般这些标签的正文是不要的
            
             /// <summary>
             /// 当指针进入尖括号内,就会触发这个属性。这里主要逻辑是提取尖括号里的标签名字
             /// </summary>
             public bool inTag {
                 get { return _inTag; }
                 set {
                     _inTag = value;
                     if (!value)
                         return;
                     bool ok = true;
                     tagName = "";
                     while (ok) {
                         string word = read ();
                         if (word != " " && word != ">") {
                             tagName += word;
                         } else if (word == " " && tagName.Length > 0) {
                             ok = false;
                         } else if (word == ">") {
                             ok = false;
                             inTag = false;
                             seek -= 1;
                         }
                     }
                 }
             }
             /// <summary>
             /// 初始化类
             /// </summary>
             /// <param name="html">
             ///  要分析的html代码
             /// </param>
             public HtmlParser (string html)
             {
                 htmlcode = new string[html.Length];
                 for (int i = 0; i < html.Length; i++) {
                     htmlcode[i] = html[i].ToString ();
                 }
                 KeepTag (new string[] {  });
             }
             /// <summary>
             /// 设置要保存那些标签不要被过滤掉
             /// </summary>
             /// <param name="tags">
             ///
             /// </param>
             public void KeepTag (string[] tags)
             {
                 keepTag = tags;
             }
            
             /// <summary>
             ///
             /// </summary>
             /// <returns>
             /// 输出处理后的文本
             /// </returns>
             public string Text ()
             {
                 int startTag = 0;
                 int endTag = 0;
                 while (seek < htmlcode.Length) {
                     string word = read ();
                     if (word.ToLower () == "<") {
                         startTag = seek;
                         inTag = true;
                     } else if (word.ToLower () == ">") {
                         endTag = seek;
                         inTag = false;
                         if (iskeepTag (tagName.Replace ("/", ""))) {
                             for (int i = startTag - 1; i < endTag; i++) {
                                 result.Append (htmlcode[i].ToString ());
                             }
                         } else if (tagName.StartsWith ("!--")) {
                             bool ok = true;
                             while (ok) {
                                 if (read () == "-") {
                                     if (read () == "-") {
                                         if (read () == ">") {
                                             ok = false;
                                         } else {
                                             seek -= 1;
                                         }
                                     }
                                 }
                             }
                         } else {
                             foreach (string str in specialTag) {
                                 if (tagName == str) {
                                     needContent = false;
                                     break;
                                 } else
                                     needContent = true;
                             }
                         }
                     } else if (!inTag && needContent) {
                         result.Append (word);
                     }
                    
                 }
                 return result.ToString ();
             }
             /// <summary>
             /// 判断是否要保存这个标签
             /// </summary>
             /// <param name="tag">
             /// A <see cref="System.String"/>
             /// </param>
             /// <returns>
             /// A <see cref="System.Boolean"/>
             /// </returns>
             private bool iskeepTag (string tag)
             {
                 foreach (string ta in keepTag) {
                     if (tag.ToLower () == ta.ToLower ()) {
                         return true;
                     }
                 }
                 return false;
             }
             private string read ()
             {
                 return htmlcode[seek++];
             }
     
         }
     }
     

  • 相关阅读:
    tar命令,vi编辑器
    Linux命令、权限
    Color Transfer between Images code实现
    利用Eclipse使用Java OpenCV(Using OpenCV Java with Eclipse)
    Matrix Factorization SVD 矩阵分解
    ZOJ Problem Set
    Machine Learning
    ZOJ Problem Set
    ZOJ Problem Set
    ZOJ Problem Set
  • 原文地址:https://www.cnblogs.com/qanholas/p/3109669.html
Copyright © 2011-2022 走看看