zoukankan      html  css  js  c++  java
  • C# Html Agility Pack

    using System;
    using HtmlAgilityPack;
    using System.IO;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Collections.Generic;
    
    namespace ConsoleApp
    {
        /// <summary>
        /// Console demo: reads index.html from the application directory, parses it with
        /// HtmlAgilityPack, extracts seven header fields and a four-column table, prints
        /// them, and reports how long parsing and extraction took.
        /// </summary>
        class Program
        {
            // Character separating a field label from its value inside a cell ("label<sep>value").
            // NOTE(review): the delimiter literal was lost in the published listing (it showed up
            // as an empty '' literal). A full-width colon (U+FF1A) is assumed here — confirm
            // against the real index.html before relying on it.
            private const char LabelSeparator = '\uFF1A';

            // XPath of the cell that hosts all report tables; every query below hangs off it.
            private const string ReportRoot = "/html/body/div/div/table/tr[2]/td";

            // Matches each CR, LF, tab or space character so it can be stripped from cell text.
            private static readonly Regex Filler = new Regex("\r|\n|\t| ");

            /// <summary>
            /// Returns the text of an HTML node with CR, LF, tab and space characters removed.
            /// </summary>
            /// <param name="_htmlnode">Node to read; may be null (an XPath query that matched nothing).</param>
            /// <param name="isSplit">
            /// When true and the cleaned text contains the label separator, only the segment
            /// between the first separator and the next one (or the end) is returned.
            /// </param>
            /// <returns>The cleaned text, or an empty string when the node is null.</returns>
            static string goText(HtmlNode _htmlnode, bool isSplit = true)
            {
                // A missing node means "no text"; handle it explicitly instead of relying on a
                // swallowed NullReferenceException.
                if (_htmlnode == null) {
                    return "";
                }

                string str = Filler.Replace(_htmlnode.InnerText ?? "", "").Trim();

                // IndexOf >= 0 guarantees Split yields at least two parts, so [1] is safe.
                if (isSplit && str.IndexOf(LabelSeparator) >= 0) {
                    str = str.Split(LabelSeparator)[1];
                }

                return str;
            }

            /// <summary>
            /// Program entry point: loads the page, extracts the fields and table rows, prints them.
            /// </summary>
            /// <param name="args">Command-line arguments (unused).</param>
            static void Main(string[] args)
            {
                // Load index.html that sits next to the executable.
                string basePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "index.html");
                if (!File.Exists(basePath)) {
                    Console.Error.WriteLine("File not found: " + basePath);
                    return;
                }
                // NOTE(review): Encoding.Default is platform dependent — confirm the file's real encoding.
                string html = File.ReadAllText(basePath, Encoding.Default);

                // Start timing; a Stopwatch is monotonic and finer grained than DateTime.Now.
                var timer = System.Diagnostics.Stopwatch.StartNew();

                // Parse the markup with HtmlAgilityPack.
                var htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(html);

                // Header fields: report number, query time, report time.
                var report_number = CellText(htmlDoc, "/table[1]/tbody/tr[2]/td[1]");
                var query_time = CellText(htmlDoc, "/table[1]/tbody/tr[2]/td[2]");
                var report_time = CellText(htmlDoc, "/table[1]/tbody/tr[2]/td[3]");
                // Identity fields: name, ID type, ID number, marital status.
                var report_name = CellText(htmlDoc, "/table[2]/tbody/tr[1]/td[1]");
                var report_type = CellText(htmlDoc, "/table[2]/tbody/tr[1]/td[2]");
                var report_id = CellText(htmlDoc, "/table[2]/tbody/tr[1]/td[3]");
                var report_marriage = CellText(htmlDoc, "/table[2]/tbody/tr[1]/td[4]");

                // Detail table, header row excluded.
                List<Table> list = ReadRows(htmlDoc);

                // Stop before printing so console I/O is not counted in the measurement.
                timer.Stop();

                Console.WriteLine("Report number: " + report_number);
                Console.WriteLine("Query time: " + query_time);
                Console.WriteLine("Report time: " + report_time);
                Console.WriteLine("Name: " + report_name);
                Console.WriteLine("ID type: " + report_type);
                Console.WriteLine("ID number: " + report_id);
                Console.WriteLine("Marital status: " + report_marriage);

                // Print the cells of every row; writing the List itself would only print its type name.
                foreach (var row in list) {
                    Console.WriteLine(string.Join("\t", row.a, row.b, row.c, row.d));
                }

                Console.WriteLine("DateTime总共花费{0}ms.", timer.Elapsed.TotalMilliseconds);
                Console.ReadLine();
            }

            // Returns the cleaned, label-stripped text of the single node at ReportRoot + relativePath.
            private static string CellText(HtmlDocument htmlDoc, string relativePath)
            {
                return goText(htmlDoc.DocumentNode.SelectSingleNode(ReportRoot + relativePath));
            }

            // Collects the rows of the detail table, skipping the first <tr> (the header row).
            private static List<Table> ReadRows(HtmlDocument htmlDoc)
            {
                List<Table> rows = new List<Table>();

                var trNodes = htmlDoc.DocumentNode.SelectNodes(
                    ReportRoot + "/table[4]/tr[3]/td/table/tbody/tr/td/table/tbody/tr");
                // SelectNodes yields null (not an empty collection) when nothing matches.
                if (trNodes == null) {
                    return rows;
                }

                bool isHeader = true;
                foreach (var tr in trNodes) {
                    if (isHeader) {
                        isHeader = false;
                        continue;
                    }

                    Table tb = new Table();
                    int i = 0;
                    foreach (var td in tr.Elements("td")) {
                        string text = goText(td, false);
                        // Map the first four cells onto a..d; any further cells are ignored.
                        switch (i)
                        {
                            case 0:
                                tb.a = text;
                                break;
                            case 1:
                                tb.b = text;
                                break;
                            case 2:
                                tb.c = text;
                                break;
                            case 3:
                                tb.d = text;
                                break;
                        }
                        i++;
                    }
                    rows.Add(tb);
                }

                return rows;
            }
        }
    
        /// <summary>
        /// One row of the extracted table; each property holds the text of one cell,
        /// in column order.
        /// </summary>
        public class Table
        {
            /// <summary>
            /// Text of the first cell (column index 0).
            /// </summary>
            public string a { get; set; }
            /// <summary>
            /// Text of the second cell (column index 1).
            /// </summary>
            public string b { get; set; }
            /// <summary>
            /// Text of the third cell (column index 2).
            /// </summary>
            public string c { get; set; }
            /// <summary>
            /// Text of the fourth cell (column index 3).
            /// </summary>
            public string d { get; set; }

            /// <summary>
            /// Formats the row as its four cell values separated by tab characters.
            /// </summary>
            /// <returns>The tab-separated cells; a null cell is rendered as an empty string.</returns>
            public override string ToString()
            {
                return string.Join("\t", a, b, c, d);
            }
        }
    }
  • 相关阅读:
    grep
    [NOI2009]植物大战僵尸
    sed
    YY的GCD
    awk
    CF1100E
    cat
    tac
    [学习笔记]基数排序
    more
  • 原文地址:https://www.cnblogs.com/CyLee/p/8029337.html
Copyright © 2011-2022 走看看