zoukankan      html  css  js  c++  java
  • HtmlAgilityPack Sample

    通过 HtmlAgilityPack 解析 HTML Table,取出其中的数据,并找出重复的内容值。

                    // Sample: read the rows of an HTML table with HtmlAgilityPack and report
                    // every content value that occurs more than once, together with the id of
                    // the first row that carries it.
                    HtmlAgilityPack.HtmlWeb hw = new HtmlAgilityPack.HtmlWeb();
                    // Load a local file (it was previously captured with a System.Net.Http.HttpClient POST).
                    HtmlAgilityPack.HtmlDocument doc = hw.Load(dir + "2019-12-03.html");
                    // Root node of the parsed document.
                    HtmlAgilityPack.HtmlNode rootNode = doc.DocumentNode;
                    // Locate the table's tbody. Single quotes are used inside the XPath so the
                    // C# string literal stays valid (the unescaped double quotes did not compile).
                    string xpath = "//*[@id='DDetail2']/tbody";
                    HtmlAgilityPack.HtmlNode node = rootNode.SelectSingleNode(xpath);
                    if (node != null)
                    {
                        // Remove the text nodes (whitespace between the tr elements) so that
                        // ChildNodes can be indexed row by row. ToArray() snapshots the sequence
                        // because nodes are removed while iterating.
                        foreach (var textNode in node.Descendants("#text").ToArray())
                        {
                            textNode.Remove();
                        }
                    }
                    // SelectSingleNode returns null when nothing matches, so guard before use.
                    if (node != null && node.ChildNodes.Count > 1)
                    {
                        List<dailyDetail> li = new List<dailyDetail>();
                        // Count - 1: the last row is the "new entry" row and is skipped.
                        for (int i = 0; i < node.ChildNodes.Count - 1; i++)
                        {
                            var row = node.ChildNodes[i];
                            // Child lookups use XPath relative to the current row.
                            var id = row.SelectSingleNode("td[1]/input[2]");
                            var text = row.SelectSingleNode("td[2]/input");
                            if (id == null || text == null)
                            {
                                // Not a data row with the expected inputs; skip it instead of
                                // failing with a NullReferenceException.
                                continue;
                            }
                            li.Add(new dailyDetail()
                            {
                                // A missing "value" attribute yields null; null contents are
                                // filtered out by the duplicate query below.
                                dailyDetailId = id.Attributes["value"]?.Value,
                                dailyContent = text.Attributes["value"]?.Value
                            });
                        }
                        // Find the contents that occur more than once. GroupBy keeps the source
                        // order inside each group, so g.First() is the first row with that content.
                        var query = li.Where(dd => dd.dailyContent != null)
                                      .GroupBy(dd => dd.dailyContent)
                                      .Where(g => g.Count() > 1)
                                      .Select(g => new
                                      {
                                          dailyContent = g.Key,
                                          firstId = g.First().dailyDetailId
                                      })
                                      .ToList();

                        foreach (var item in query)
                        {
                            Console.WriteLine($"重复值:{item.dailyContent}");
                            Console.WriteLine($"首个Id:{item.firstId}");
                        }
                    }
    

      

  • 相关阅读:
    lnmp环境搭建
    ffmpeg基础使用
    mongodb 副本集搭建
    二 利用pandas统计中国百亿富豪的信息
    1 mongodb安装及启动
    2 mongodb设置密码登录和创建库
    一 pandas读取excle数据
    rancher的使用
    redis主从配置
    redis安装和配置
  • 原文地址:https://www.cnblogs.com/honk/p/12883675.html
Copyright © 2011-2022 走看看