zoukankan      html  css  js  c++  java
  • 使用HtmlParser使用心得

    最近因工作需要,要检查html中哪些标签不合理或者没有闭合。在网上找了很久都没有找到比较合适的工具,于是就试着用HtmlParser来处理。

    获取html的代码:

       /// <summary>
       /// Downloads the resource at <paramref name="url"/> with an HTTP GET request
       /// (redirects are followed) and returns the response body as text.
       /// </summary>
       /// <param name="url">Absolute URL of the page to download.</param>
       /// <returns>
       /// The decoded response body. If any step throws, the exception message is
       /// returned instead, so callers cannot tell an error text from real content.
       /// </returns>
       string GetContentFromUrl(string url)
            {
                string content = string.Empty;
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    request.Method = "GET";
                    request.AllowAutoRedirect = true;

                    // The using blocks release the connection and both streams even when reading fails.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (Stream stream = response.GetResponseStream())
                    using (MemoryStream body = new MemoryStream())
                    {
                        byte[] buffer = new byte[4096];
                        int readLength;
                        while ((readLength = stream.Read(buffer, 0, buffer.Length)) > 0)
                        {
                            body.Write(buffer, 0, readLength);
                        }

                        // Fall back to UTF-8 when the server sends no charset or one this runtime does not know.
                        Encoding coding = Encoding.UTF8;
                        string charset = response.CharacterSet;
                        if (!string.IsNullOrEmpty(charset))
                        {
                            try
                            {
                                // Some servers quote the charset value in the Content-Type header.
                                coding = Encoding.GetEncoding(charset.Trim(' ', '"', '\''));
                            }
                            catch (ArgumentException)
                            {
                                // Unknown charset label: keep the UTF-8 fallback.
                            }
                        }

                        // Decode the whole body at once: decoding each 4096-byte chunk separately
                        // corrupts any multi-byte character that straddles a chunk boundary.
                        content = coding.GetString(body.ToArray());
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort contract kept from the original: report the failure as the returned text.
                    content = ex.Message;
                }
                return content;
            }

    解析html代码,以下代码在网上都能找到:

      /// <summary>
      /// Mirrors an HTML node and its descendants into a tree of <see cref="TreeNode"/>s.
      /// Every non-end tag becomes a tree node labelled with the tag name followed by its
      /// attributes; other nodes add no tree node, but their children are still visited.
      /// </summary>
      /// <param name="treeNode">Tree node that receives the nodes created for <paramref name="htmlNode"/>.</param>
      /// <param name="htmlNode">HTML node to convert; nothing happens when it is null.</param>
      /// <param name="siblingRequired">
      /// When true, every following sibling of <paramref name="htmlNode"/> is also added
      /// under <paramref name="treeNode"/>.
      /// </param>
      private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
            {
                if (htmlNode == null || treeNode == null)
                {
                    return;
                }

                // Nodes that are not opening tags get no tree node of their own,
                // so their children attach directly to the incoming parent.
                TreeNode current = treeNode;

                ITag tag = htmlNode as ITag;
                if (tag != null && !tag.IsEndTag())
                {
                    StringBuilder label = new StringBuilder();
                    label.Append(tag.TagName).Append(' ');

                    if (tag.Attributes != null && tag.Attributes.Count > 0)
                    {
                        bool firstAttribute = true;
                        foreach (string key in tag.Attributes.Keys)
                        {
                            // NOTE(review): presumably the parser stores the tag's own name under a
                            // "<TAGNAME>" pseudo-key, which is not a real attribute - confirm.
                            if (key.Contains("<TAGNAME>"))
                            {
                                continue;
                            }

                            object value = tag.Attributes[key];
                            if (value == null)
                            {
                                continue;
                            }

                            // Separate attributes with a space; the original ran them
                            // together as id="a"class="b".
                            if (!firstAttribute)
                            {
                                label.Append(' ');
                            }
                            label.Append(key).Append("=\"").Append(value.ToString()).Append('"');
                            firstAttribute = false;
                        }
                    }

                    current = new TreeNode(label.ToString());
                    treeNode.Nodes.Add(current);
                }

                // Children: start at the first child and let that call walk the child's siblings.
                if (htmlNode.Children != null && htmlNode.Children.Count > 0)
                {
                    this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
                }

                // Siblings: visited from this loop with siblingRequired=false, so each
                // sibling is processed exactly once instead of re-walking the chain.
                if (siblingRequired)
                {
                    INode sibling = htmlNode.NextSibling;
                    while (sibling != null)
                    {
                        this.RecursionHtmlNode(treeNode, sibling, false);
                        sibling = sibling.NextSibling;
                    }
                }
            }
      /// <summary>
      /// Parses the HTML text held in <c>txtContent</c> and rebuilds <c>treeView1</c> from it:
      /// the tree is cleared, a single "root" node is added, and every top-level parsed
      /// node is converted beneath it. Does nothing when the text is null or empty.
      /// </summary>
      void ParseHTml()
            {
                string html = this.txtContent.Text;
                if (!string.IsNullOrEmpty(html))
                {
                    // NOTE(review): passing null to Parse presumably means "no filter" - confirm against the parser.
                    Parser htmlParser = new Parser(new Lexer(html));
                    NodeList topLevelNodes = htmlParser.Parse(null);

                    // The tree is reset only after parsing has finished, as in the original ordering.
                    this.treeView1.Nodes.Clear();
                    TreeNode rootNode = this.treeView1.Nodes.Add("root");

                    int index = 0;
                    while (index < topLevelNodes.Count)
                    {
                        // Siblings are not followed here because this loop already visits every top-level node.
                        this.RecursionHtmlNode(rootNode, topLevelNodes[index], false);
                        index++;
                    }
                }
            }

    运行结果如图:


    网上有关HtmlParser的源代码下载比较麻烦,我把该部分代码也放在此次demo中了,下载地址:http://download.csdn.net/detail/dz45693/4374572

  • 相关阅读:
    git删除目录,且保留本地的
    gitpush 免密码
    git常用操作
    ubuntu安装Nodejs
    ubuntu如何配置samba
    用AI将png转成svg做字符图标教程
    windows server 2012设置远程连接断开后自动注销
    windows 2012执行计划任务错误:操作员或系统管理员拒绝了请求(0x800710E0)
    删除节点
    代理 XP”组件已作为此服务器安全配置的一部分被关闭。系统管理员可以使用 sp_configure 来启用“代理 XP”。
  • 原文地址:https://www.cnblogs.com/majiang/p/2574741.html
Copyright © 2011-2022 走看看