zoukankan      html  css  js  c++  java
  • HTML解析类 ,让你不使用正则也能轻松获取HTML相关元素 C# .NET

    功能:

    1、轻松获取指元素HTML元素。

    2、可以根据属性标签进行筛选

    3、返回的都是Llist强类型无需转换

    用过XElement的都知道 用来解析XML非常的方便,但是对于HTML的格式多样化实在是没办法兼容。

    所以我就写了这么一个类似XElement的 XHTMLElement

    用法:

                string filePath = Server.MapPath("~/file/test.htm");
                //获取HTML代码
                string mailBody = FileHelper.FileToString(filePath);
    
                XHtmlElement xh = new XHtmlElement(mailBody);
    
                //获取body的子集a标签并且class="icon"
                var link = xh.Descendants("body").ChildDescendants("a").Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon")).ToList();
    
                //获取带href的a元素
                var links = xh.Descendants("a").Where(c => c.Attributes.Any(a => a.Key == "href")).ToList();
                foreach (var r in links)
                {
                    Response.Write(r.Attributes.Single(c => c.Key == "href").Value); //出输href
                }
    
                //获取第一个img
                var img = xh.Descendants("img");
    
                //获取最近的第一个p元素以及与他同一级的其它p元素
                var ps = xh.Descendants("p");

    代码:

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Web;
    using System.Text;
    using System.Text.RegularExpressions;
    
    namespace SyntacticSugar
    {
        /// <summary>
        /// ** 描述:html解析类
        /// ** 创始时间:2015-4-23
        /// ** 修改时间:-
        /// ** 作者:sunkaixuan
        /// ** qq:610262374 欢迎交流,共同提高 ,命名语法等写的不好的地方欢迎大家的给出宝贵建议
        /// </summary>
        public class XHtmlElement
        {
            private string _html;
            public XHtmlElement(string html)
            {
                _html = html;
            }
    
            /// <summary>
            /// 获取最近的相同层级的HTML元素
            /// </summary>
            /// <param name="elementName">等于null为所有元素</param>
            /// <returns></returns>
            public List<HtmlInfo> Descendants(string elementName = null)
            {
                if (_html == null)
                {
                    throw new ArgumentNullException("html不能这空!");
                }
                var allList = RootDescendants(_html);
                var reval = allList.Where(c => elementName == null || c.TagName.ToLower() == elementName.ToLower()).ToList();
                if (reval == null || reval.Count == 0)
                {
                    reval = GetDescendantsSource(allList, elementName);
                }
                return reval;
            }
    
    
            /// <summary>
            /// 获取第一级元素
            /// </summary>
            /// <param name="elementName"></param>
            /// <returns></returns>
            public List<HtmlInfo> RootDescendants(string html = null)
            {
                /*
                 * 业务逻辑:
                             * 1、获取第一个html标签一直找结尾标签,如果在这个过程中遇到相同的标签收尾标签就要加1
                             * 2、第一个标签取到后继续第一步操作,找第2个元素 。。第N个元素
                 */
                if (html == null) html = _html;
                var firstTag = Regex.Match(html, "<.+?>");
    
                List<string> eleList = new List<string>();
                List<HtmlInfo> reval = new List<HtmlInfo>();
                GetElementsStringList(html, ref eleList);
                foreach (var r in eleList)
                {
                    HtmlInfo data = new HtmlInfo();
                    data.OldFullHtml = r;
                    data.SameLeveHtml = html;
                    data.TagName = Regex.Match(r, @"(?<=\s{1}|\<)[a-z,A-Z]+(?=\>|\s)", RegexOptions.IgnoreCase).Value;
                    data.InnerHtml = Regex.Match(r, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;
                    var eleBegin = Regex.Match(r, "<.+?>").Value;
                    var attrList = Regex.Matches(eleBegin, @"[a-z,A-Z]+\="".+?""").Cast<Match>().Select(c => new { key = c.Value.Split('=').First(), value = c.Value.Split('=').Last().TrimEnd('"').TrimStart('"') }).ToList();
                    data.Attributes = new Dictionary<string, string>();
                    if (attrList != null && attrList.Count > 0)
                    {
                        foreach (var a in attrList)
                        {
                            data.Attributes.Add(a.key, a.value);
                        }
                    }
                    reval.Add(data);
                }
                return reval;
    
            }
    
    
    
    
    
            #region private
            private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
            {
                foreach (var r in allList)
                {
                    if (r.InnerHtml == null || !r.InnerHtml.Contains("<")) continue;
                    var childList = RootDescendants(r.InnerHtml).Where(c => elementName == null || c.TagName.ToLower() == elementName.ToLower()).ToList();
                    if (childList == null || childList.Count == 0)
                    {
                        childList = GetDescendantsSource(RootDescendants(r.InnerHtml), elementName);
                        if (childList != null && childList.Count > 0)
                            return childList;
                    }
                    else
                    {
                        return childList;
                    }
                }
                return null;
            }
    
            private void GetElementsStringList(string html, ref List<string> eleList)
            {
                HtmlInfo info = new HtmlInfo();
                info.TagName = Regex.Match(html, @"(?<=\<\s{0,5}|\<)([a-z,A-Z]+|h\d{1})(?=\>|\s)", RegexOptions.IgnoreCase).Value;
                string currentTagBeginReg = @"<\s{0,10}" + info.TagName + @".*?>";//获取当前标签元素开始标签正则
                string currentTagEndReg = @"\<\/" + info.TagName + @"\>";//获取当前标签元素收尾标签正则
                if (string.IsNullOrEmpty(info.TagName)) return;
    
                string eleHtml = "";
                //情况1 <a/>
                //情况2 <a></a>
                //情况3 <a> 错误格式
                //情况4endif
                if (Regex.IsMatch(html, @"<\s{0,10}" + info.TagName + "[^<].*?/>"))//单标签
                {
                    eleHtml = Regex.Match(html, @"<\s{0,10}" + info.TagName + "[^<].*?/>").Value;
                }
                else if (!Regex.IsMatch(html, currentTagEndReg))//没有收尾
                {
                    if (Regex.IsMatch(html, @"\s{0,10}\<\!\-\-\[if"))
                    {
                        eleHtml = GetElementString(html, @"\s{0,10}\<\!\-\-\[if", @"\[endif\]\-\-\>", 1);
                    }
                    else
                    {
                        eleHtml = Regex.Match(html, currentTagBeginReg,RegexOptions.Singleline).Value;
                    }
                }
                else
                {
                    eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
                }
    
    
                try
                {
                    eleList.Add(eleHtml);
                    html = html.Replace(eleHtml, "");
                    html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "");
                    if (!Regex.IsMatch(html, @"^\s*$"))
                    {
                        GetElementsStringList(html, ref eleList);
                    }
    
                }
                catch (Exception ex)
                {
                    throw new Exception("SORRY,您的HTML格式不能解析!!!");
    
                }
    
            }
    
            private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)
            {
    
                string newHtml = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, i);
                var currentTagBeginMatches = Regex.Matches(newHtml, currentTagBeginReg, RegexOptions.Singleline).Cast<Match>().Select(c => c.Value).ToList();
                var currentTagEndMatches = Regex.Matches(newHtml, currentTagEndReg).Cast<Match>().Select(c => c.Value).ToList();
                if (currentTagBeginMatches.Count == currentTagEndMatches.Count)
                { //两个签标元素相等
                    return newHtml;
                }
                return GetElementString(html, currentTagBeginReg, currentTagEndReg, ++i);
            }
    
            private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)
            {
                return Regex.Match(val, currentTagBeginReg + @"((.*?)" + currentTagEndReg + "){" + i + "}?", RegexOptions.IgnoreCase | RegexOptions.Singleline).Value;
            }
            #endregion
    
    
    
        }
        public static class XHtmlElementExtendsion
        {
            /// <summary>
            /// 获取最近的相同层级的HTML元素
            /// </summary>
            /// <param name="elementName">等于null为所有元素</param>
            /// <returns></returns>
            public static List<HtmlInfo> Descendants(this  IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
            {
                var html = htmlInfoList.First().InnerHtml;
                XHtmlElement xhe = new XHtmlElement(html);
                return xhe.Descendants(elementName);
            }
            /// <summary>
            /// 获取下级元素
            /// </summary>
            /// <param name="elementName"></param>
            /// <returns></returns>
            public static List<HtmlInfo> ChildDescendants(this  IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
            {
                var html = htmlInfoList.First().InnerHtml;
                XHtmlElement xhe = new XHtmlElement(html);
                return xhe.RootDescendants(html).Where(c => elementName == null || c.TagName == elementName).ToList();
            }
    
            /// <summary>
            /// 获取父级
            /// </summary>
            /// <param name="htmlInfoList"></param>
            /// <returns></returns>
            public static List<HtmlInfo> ParentDescendant(this  IEnumerable<HtmlInfo> htmlInfoList,string fullHtml)
            {
                var saveLeveHtml = htmlInfoList.First().SameLeveHtml;
                string replaceGuid=Guid.NewGuid().ToString();
                fullHtml = fullHtml.Replace(saveLeveHtml,replaceGuid);
                var parentHtml = Regex.Match(fullHtml, @"<[^<]+?>[^<]*?" + replaceGuid + @".*?<\/.+?>").Value;
                parentHtml = parentHtml.Replace(replaceGuid, saveLeveHtml);
                XHtmlElement xhe = new XHtmlElement(parentHtml);
                return xhe.RootDescendants();
            }
        }
        /// <summary>
        /// html信息类
        /// </summary>
        public class HtmlInfo
        {
            /// <summary>
            /// 元素名
            /// </summary>
            public string TagName { get; set; }
            /// <summary>
            /// 元素属性
            /// </summary>
            public Dictionary<string, string> Attributes { get; set; }
            /// <summary>
            /// 元素内部html
            /// </summary>
            public string InnerHtml { get; set; }
    
            public string OldFullHtml { get; set; }
    
            public string SameLeveHtml { get; set; }
    
            /// <summary>
            /// 得到元素的html
            /// </summary>
            /// <returns></returns>
            public string FullHtml
            {
                get
                {
                    StringBuilder reval = new StringBuilder();
                    string attributesString = string.Empty;
                    if (Attributes != null && Attributes.Count > 0)
                    {
                        attributesString = string.Join(" ", Attributes.Select(c => string.Format("{0}=\"{1}\"", c.Key, c.Value)));
                    }
                    reval.AppendFormat("<{0} {2}>{1}</{0}>", TagName, InnerHtml, attributesString);
                    return reval.ToString();
                }
            }
        }
    }

    前台HTML:

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
        <title></title>
    </head>
    <body>
        <a id="1">我是1</a> 
        <a id="2" class="icon">icon</a>
        <img />
    </body>
    </html>
  • 相关阅读:
    东汉末年,他们把「服务雪崩」玩到了极致(干货)
    我是一个秒杀请求,正在逃离这颗星球...
    《SpringCloud实战项目》系列目录
    《Java并发必知必会》系列
    微前端大赏二-singlespa实践
    redis传输协议规范(翻译)-上(Redis Protocol specification)
    oracle 查询数据库锁及锁处理
    golang string 转 int && int 转 string
    Json刚明白,怎么又出来个Bson?
    Linux中的buffer和cache到底是什么?今天终于明白了
  • 原文地址:https://www.cnblogs.com/sunkaixuan/p/4482121.html
Copyright © 2011-2022 走看看