zoukankan      html  css  js  c++  java
  • [C#]

    背景

    在做网页数据分析的时候,我们关注的部分是内容,可以过滤掉HTML标签、Javascript、CSS等代码。

    目标输入

    <b>Hello World.</b><br/><p><i>Is there anyone out there?</i><p>

    输出结果

    Hello World. Is there anyone out there?

    开发工具

    Html Agility Pack
    http://html-agility-pack.net/

    实现方案1:(过滤规则严谨,保留HTML版式,推荐使用!

    //small but important modification to class https://github.com/zzzprojects/html-agility-pack/blob/master/src/Samples/Html2Txt/HtmlConvert.cs
    public static class HtmlToText
    {
    
        public static string Convert(string path)
        {
            HtmlDocument doc = new HtmlDocument();
            doc.Load(path);
            return ConvertDoc(doc);
        }
    
        public static string ConvertHtml(string html)
        {
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);
            return ConvertDoc(doc);
        }
    
        public static string ConvertDoc (HtmlDocument doc)
        {
            using (StringWriter sw = new StringWriter())
            {
                ConvertTo(doc.DocumentNode, sw);
                sw.Flush();
                return sw.ToString();
            }
        }
    
        internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
        {
            foreach (HtmlNode subnode in node.ChildNodes)
            {
                ConvertTo(subnode, outText, textInfo);
            }
        }
        public static void ConvertTo(HtmlNode node, TextWriter outText)
        {
            ConvertTo(node, outText, new PreceedingDomTextInfo(false));
        }
        internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
        {
            string html;
            switch (node.NodeType)
            {
                case HtmlNodeType.Comment:
                    // don't output comments
                    break;
                case HtmlNodeType.Document:
                    ConvertContentTo(node, outText, textInfo);
                    break;
                case HtmlNodeType.Text:
                    // script and style must not be output
                    string parentName = node.ParentNode.Name;
                    if ((parentName == "script") || (parentName == "style"))
                    {
                        break;
                    }
                    // get text
                    html = ((HtmlTextNode)node).Text;
                    // is it in fact a special closing node output as text?
                    if (HtmlNode.IsOverlappedClosingElement(html))
                    {
                        break;
                    }
                    // check the text is meaningful and not a bunch of whitespaces
                    if (html.Length == 0)
                    {
                        break;
                    }
                    if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                    {
                        html= html.TrimStart();
                        if (html.Length == 0) { break; }
                        textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                    }
                    outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"s{2,}", " ")));
                    if (textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]))
                    {
                        outText.Write(' ');
                    }
                        break;
                case HtmlNodeType.Element:
                    string endElementString = null;
                    bool isInline;
                    bool skip = false;
                    int listIndex = 0;
                    switch (node.Name)
                    {
                        case "nav":
                            skip = true;
                            isInline = false;
                            break;
                        case "body":
                        case "section":
                        case "article":
                        case "aside":
                        case "h1":
                        case "h2":
                        case "header":
                        case "footer":
                        case "address":
                        case "main":
                        case "div":
                        case "p": // stylistic - adjust as you tend to use
                            if (textInfo.IsFirstTextOfDocWritten)
                            {
                                outText.Write("
    ");
                            }
                            endElementString = "
    ";
                            isInline = false;
                            break;
                        case "br":
                            outText.Write("
    ");
                            skip = true;
                            textInfo.WritePrecedingWhiteSpace = false;
                            isInline = true;
                            break;
                        case "a":
                            if (node.Attributes.Contains("href"))
                            {
                                string href = node.Attributes["href"].Value.Trim();
                                if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase)==-1)
                                {
                                    endElementString =  "<" + href + ">";
                                }  
                            }
                            isInline = true;
                            break;
                        case "li": 
                            if(textInfo.ListIndex>0)
                            {
                                outText.Write("
    {0}.	", textInfo.ListIndex++); 
                            }
                            else
                            {
                                outText.Write("
    *	"); //using '*' as bullet char, with tab after, but whatever you want eg "	->", if utf-8 0x2022
                            }
                            isInline = false;
                            break;
                        case "ol": 
                            listIndex = 1;
                            goto case "ul";
                        case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
                            endElementString = "
    ";
                            isInline = false;
                            break;
                        case "img": //inline-block in reality
                            if (node.Attributes.Contains("alt"))
                            {
                                outText.Write('[' + node.Attributes["alt"].Value);
                                endElementString = "]";
                            }
                            if (node.Attributes.Contains("src"))
                            {
                                outText.Write('<' + node.Attributes["src"].Value + '>');
                            }
                            isInline = true;
                            break;
                        default:
                            isInline = true;
                            break;
                    }
                    if (!skip && node.HasChildNodes)
                    {
                        ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten){ ListIndex = listIndex });
                    }
                    if (endElementString != null)
                    {
                        outText.Write(endElementString);
                    }
                    break;
            }
        }
    }
    internal class PreceedingDomTextInfo
    {
        public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
        {
            IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
        }
        public bool WritePrecedingWhiteSpace {get;set;}
        public bool LastCharWasSpace { get; set; }
        public readonly BoolWrapper IsFirstTextOfDocWritten;
        public int ListIndex { get; set; }
    }
    internal class BoolWrapper
    {
        public BoolWrapper() { }
        public bool Value { get; set; }
        public static implicit operator bool(BoolWrapper boolWrapper)
        {
            return boolWrapper.Value;
        }
        public static implicit operator BoolWrapper(bool boolWrapper)
        {
            return new BoolWrapper{ Value = boolWrapper };
        }
    }

    实现方案2:过滤规则不严谨,适用于结构简单的HTML)

    public static string StripHTML(string HTMLText, bool decode = true)
    {
        Regex reg = new Regex("<[^>]+>", RegexOptions.IgnoreCase);
        var stripped = reg.Replace(HTMLText, "");
        return decode ? HttpUtility.HtmlDecode(stripped) : stripped;
    }

    参考资料

    https://stackoverflow.com/a/25178738
    https://stackoverflow.com/a/732110

  • 相关阅读:
    transaction sql
    谈谈tempdb在系统中的重要作用
    Windows快捷键
    sp_addlinkedserver使用方法
    大型网站用户登录信息保存实现的探讨
    Proj EULibHarn Paper Reading: Graspan: A SingleMachine DiskBased Graph System for Interprocedural Static Analyses of LargeScale Systems Code
    Proj EULibHarn Paper Reading: Towards Efficient LargeScale Interprocedural Program Static Analysis on Distributed DataParallel Computation
    Proj EULibHarn Paper Reading: APICraft: Fuzz Driver Generation for Closedsource SDK Libraries
    Proj EULibHarn Paper Reading: Systemizing Interprocedural Static Analysis of LargeScale Systems Code with Graspan
    Proj EULibHarn Paper Reading: Chianina: An Evolving Graph System for Flow and ContextSensitive Analyses of Million Lines of C Code
  • 原文地址:https://www.cnblogs.com/jinzesudawei/p/8713497.html
Copyright © 2011-2022 走看看