zoukankan      html  css  js  c++  java
  • 使用HtmlParser使用心得

    最近因工作需要,要检查html中哪些标签不合理或者没有闭合。在网上找了很久都没有找到比较合适的工具,于是就试着用HtmlParser来实现。

    获取html的代码:

       /// <summary>
       /// Downloads the resource at <paramref name="url"/> with an HTTP GET and
       /// returns its body decoded as text.
       /// </summary>
       /// <param name="url">Address of the page to fetch.</param>
       /// <returns>
       /// The decoded response body. If any exception is thrown while requesting
       /// or reading, the exception's message is returned instead.
       /// </returns>
       string GetContentFromUrl(string url)
            {
                string content = string.Empty;
                try
                {
                    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
                    request.Method = "GET";
                    request.AllowAutoRedirect = true;

                    // Dispose the response even when reading fails, so the
                    // underlying connection is always released.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        // Fall back to UTF-8 when the server reports no charset or
                        // one this runtime does not know; otherwise GetEncoding
                        // would throw and the page content would be lost.
                        Encoding coding = Encoding.UTF8;
                        string charset = response.CharacterSet;
                        if (!string.IsNullOrEmpty(charset))
                        {
                            try
                            {
                                coding = Encoding.GetEncoding(charset.Trim().Trim('"', '\''));
                            }
                            catch (ArgumentException)
                            {
                                // Unknown charset name: keep the UTF-8 fallback.
                            }
                        }

                        // Decode through a StreamReader instead of converting each
                        // raw buffer separately: a multi-byte character split
                        // across two reads would otherwise be corrupted.
                        using (Stream stream = response.GetResponseStream())
                        using (StreamReader reader = new StreamReader(stream, coding))
                        {
                            content = reader.ReadToEnd();
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Existing contract: report the failure text as the result.
                    content = ex.Message;
                }
                return content;
            }

    解析html代码,以下代码在网上都能找到:

      /// <summary>
      /// Mirrors an HTML node, its descendants and optionally its following
      /// siblings into a tree of <see cref="TreeNode"/> items.
      /// </summary>
      /// <param name="treeNode">Tree node that receives the entries created for <paramref name="htmlNode"/>.</param>
      /// <param name="htmlNode">Parsed HTML node to visit; nothing happens when it is null.</param>
      /// <param name="siblingRequired">
      /// When true, every following sibling of <paramref name="htmlNode"/> is also
      /// visited and attached under <paramref name="treeNode"/>.
      /// </param>
      private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
            {
                if (htmlNode == null || treeNode == null) return;

                // Non-tag nodes and end tags create no entry of their own; their
                // children are attached directly to the incoming tree node.
                TreeNode current = treeNode;

                ITag tag = htmlNode as ITag;
                if (tag != null && !tag.IsEndTag())
                {
                    string nodeString = tag.TagName + " ";
                    if (tag.Attributes != null && tag.Attributes.Count > 0)
                    {
                        StringBuilder sb = new StringBuilder();
                        foreach (string key in tag.Attributes.Keys)
                        {
                            // NOTE(review): presumably the parser's internal entry
                            // that stores the tag name itself - confirm against the library.
                            if (key.Contains("<TAGNAME>"))
                                continue;

                            object value = tag.Attributes[key];
                            if (value == null)
                                continue;

                            // Separate attributes with a space so the label does
                            // not read like id="a"class="b".
                            if (sb.Length > 0)
                                sb.Append(' ');
                            sb.Append(key).Append("=\"").Append(value.ToString()).Append('"');
                        }

                        nodeString += sb.ToString();
                    }
                    current = new TreeNode(nodeString);
                    treeNode.Nodes.Add(current);
                }

                // Children: start at the first child and let that call walk the
                // remaining children through the sibling loop below.
                if (htmlNode.Children != null && htmlNode.Children.Count > 0)
                {
                    this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
                }

                // Siblings: each one is visited with siblingRequired = false so the
                // sibling chain is traversed exactly once, by this loop.
                if (siblingRequired)
                {
                    INode sibling = htmlNode.NextSibling;
                    while (sibling != null)
                    {
                        this.RecursionHtmlNode(treeNode, sibling, false);
                        sibling = sibling.NextSibling;
                    }
                }
            }
      /// <summary>
      /// Parses the HTML text held in <c>txtContent</c> and rebuilds
      /// <c>treeView1</c> with a single "root" node that contains one subtree
      /// per parsed top-level node. Does nothing when the text is null or empty.
      /// </summary>
      void ParseHTml()
            {
                string html = this.txtContent.Text;
                if (!string.IsNullOrEmpty(html))
                {
                    // Parse before touching the tree view, so the existing tree
                    // is left as it was if parsing throws.
                    Parser htmlParser = new Parser(new Lexer(html));
                    NodeList parsedNodes = htmlParser.Parse(null);

                    this.treeView1.Nodes.Clear();
                    TreeNode rootNode = this.treeView1.Nodes.Add("root");

                    int index = 0;
                    while (index < parsedNodes.Count)
                    {
                        // Top-level nodes are enumerated here, so siblings are not followed.
                        this.RecursionHtmlNode(rootNode, parsedNodes[index], false);
                        index++;
                    }
                }
            }

    运行结果如图:


    网上有关HtmlParser的源代码下载比较麻烦,我把该部分代码也放在此次demo中了,下载地址:http://download.csdn.net/detail/dz45693/4374572

  • 相关阅读:
    VSTO不能创建OFFICE 文档项目的原因
    vs2016 创建 vsto excel 文件项目的一个问题
    一个开发原则:永远不要返回NULL
    客户为什么习惯变更需求
    从实际项目中的一个改进细节谈程序的易用性优化
    第三方系统打开EAFC的实现
    功能间(两个form)数据交互的编程方法
    关于行军模式大批量数据的审批的实现
    程序的升级发布管理
    转:从如何判断浮点数是否等于0说起——浮点数的机器级表示 献给依然 if ( double i ==0.00)的菜鸟们
  • 原文地址:https://www.cnblogs.com/majiang/p/2574741.html
Copyright © 2011-2022 走看看