zoukankan      html  css  js  c++  java
  • C# Html格式内容转Csv内容包括table(重点在rowspan和colspan合并),p,div元素

    将 HTML 格式内容转换为 CSV 内容,支持 table(重点处理 rowspan 和 colspan 合并单元格)、p、div 元素;不支持嵌套的 table。

      /// <summary>
      /// Converts HTML markup into CSV-like text covering table, p and div elements.
      /// Tables are flattened into a rectangular grid in which a cell merged through
      /// rowspan/colspan puts its text in the top-left slot and leaves the remaining
      /// covered slots empty. Nested tables are not supported.
      /// </summary>
      /// <param name="hrml">HTML markup to convert. Null or empty yields an empty result.</param>
      /// <returns>
      /// Text in which every table is introduced by a "#flag_table#," line and each text
      /// section (all p elements, then all div elements) by a "#flag_text#," line. Every
      /// value is followed by a comma and every line ends with one extra comma.
      /// </returns>
      private string HtmlToCsv(string hrml)
      {
          if (string.IsNullOrEmpty(hrml))
          {
              return string.Empty;
          }

          HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
          doc.LoadHtml(hrml);
          StringBuilder sbLines = new StringBuilder();

          HtmlAgilityPack.HtmlNodeCollection tList = doc.DocumentNode.SelectNodes("//table");
          if (tList != null)
          {
              foreach (HtmlAgilityPack.HtmlNode table in tList)
              {
                  sbLines.AppendLine("#flag_table#,");

                  // ".//tr" is evaluated relative to this table. A bare "//tr" is an absolute
                  // path and would return the rows of every table in the document.
                  HtmlAgilityPack.HtmlNodeCollection rows = table.SelectNodes(".//tr");
                  if (rows != null)
                  {
                      // Assemble the CSV lines for this table.
                      foreach (string[] cells in BuildTableGrid(rows))
                      {
                          foreach (string cell in cells)
                          {
                              sbLines.Append(cell + ",");
                          }
                          sbLines.AppendLine(",");
                      }

                      // Empty the table so that p/div elements inside it are not emitted
                      // again by the text sections below.
                      table.RemoveAll();
                  }
              }
          }

          HtmlAgilityPack.HtmlNodeCollection pList = doc.DocumentNode.SelectNodes("//p");
          if (pList != null)
          {
              sbLines.AppendLine("#flag_text#,");
              foreach (HtmlAgilityPack.HtmlNode p in pList)
              {
                  AppendTextLine(sbLines, p);

                  // Empty the paragraph so an enclosing div does not repeat its text.
                  p.RemoveAll();
              }
          }

          // The div list is queried after tables and paragraphs were emptied, so only
          // text that has not been emitted yet remains in each div.
          HtmlAgilityPack.HtmlNodeCollection dList = doc.DocumentNode.SelectNodes("//div");
          if (dList != null)
          {
              sbLines.AppendLine("#flag_text#,");
              foreach (HtmlAgilityPack.HtmlNode div in dList)
              {
                  AppendTextLine(sbLines, div);
              }
          }

          return sbLines.ToString();
      }

      /// <summary>
      /// Lays the cells of the given rows out on a rectangular grid, honouring
      /// rowspan and colspan.
      /// </summary>
      /// <param name="rows">The tr nodes of a single table, in document order.</param>
      /// <returns>One string array per row; all arrays have the same length and contain no nulls.</returns>
      private static string[][] BuildTableGrid(HtmlAgilityPack.HtmlNodeCollection rows)
      {
          // Upper bound for a single colspan so that a bogus attribute value cannot
          // allocate an enormous row (the HTML Standard uses the same limit).
          const int maxColSpan = 1000;

          int rowCount = rows.Count;

          // null marks a free slot; any non-null value marks a slot already claimed
          // by a cell or by the span of an earlier cell.
          List<List<string>> grid = new List<List<string>>(rowCount);
          for (int r = 0; r < rowCount; r++)
          {
              grid.Add(new List<string>());
          }

          for (int r = 0; r < rowCount; r++)
          {
              int c = 0;
              foreach (HtmlAgilityPack.HtmlNode cell in rows[r].ChildNodes.Where(n => IsTableCell(n)))
              {
                  // Skip slots covered by a rowspan reaching down from a previous row.
                  while (c < grid[r].Count && grid[r][c] != null)
                  {
                      c++;
                  }

                  int colspan = ReadSpan(cell, "colspan", maxColSpan);
                  // A rowspan may not extend below the last row of the table.
                  int rowspan = ReadSpan(cell, "rowspan", rowCount - r);
                  string text = CleanText(cell.InnerText);

                  // Claim the whole spanned rectangle with empty strings.
                  for (int ri = r; ri < r + rowspan; ri++)
                  {
                      List<string> line = grid[ri];
                      while (line.Count < c + colspan)
                      {
                          line.Add(null);
                      }
                      for (int ci = c; ci < c + colspan; ci++)
                      {
                          if (line[ci] == null)
                          {
                              line[ci] = string.Empty;
                          }
                      }
                  }

                  // The top-left slot carries the text. An empty merged cell is written
                  // as a single space, which keeps the previous output format.
                  bool merged = colspan > 1 || rowspan > 1;
                  grid[r][c] = (merged && text.Length == 0) ? " " : text;

                  c += colspan;
              }
          }

          int width = 0;
          foreach (List<string> line in grid)
          {
              if (line.Count > width)
              {
                  width = line.Count;
              }
          }

          // Pad short rows and replace leftover free slots so the grid is rectangular.
          string[][] result = new string[rowCount][];
          for (int r = 0; r < rowCount; r++)
          {
              result[r] = new string[width];
              for (int c = 0; c < width; c++)
              {
                  string value = c < grid[r].Count ? grid[r][c] : null;
                  result[r][c] = value ?? string.Empty;
              }
          }
          return result;
      }

      /// <summary>
      /// Tells whether a node is a table cell (td or th).
      /// </summary>
      private static bool IsTableCell(HtmlAgilityPack.HtmlNode node)
      {
          return string.Equals(node.Name, "td", System.StringComparison.OrdinalIgnoreCase)
              || string.Equals(node.Name, "th", System.StringComparison.OrdinalIgnoreCase);
      }

      /// <summary>
      /// Reads a rowspan/colspan attribute. Missing, malformed or non-positive values
      /// count as 1; values above <paramref name="max"/> are reduced to it.
      /// </summary>
      private static int ReadSpan(HtmlAgilityPack.HtmlNode cell, string attributeName, int max)
      {
          HtmlAgilityPack.HtmlAttribute attr = cell.Attributes[attributeName];
          int span;
          if (attr == null
              || !int.TryParse(attr.Value, System.Globalization.NumberStyles.Integer,
                               System.Globalization.CultureInfo.InvariantCulture, out span)
              || span < 1)
          {
              return 1;
          }
          return span > max ? max : span;
      }

      /// <summary>
      /// Removes the characters that would break the CSV layout (non-breaking-space
      /// entities, commas, line breaks) and trims the result.
      /// </summary>
      private static string CleanText(string raw)
      {
          return raw.Replace("&nbsp;", "").Replace(",", "").Replace("\r", "").Replace("\n", "").Trim();
      }

      /// <summary>
      /// Appends one CSV line for a text element: the cleaned text followed by a comma
      /// when it is not blank, then the closing comma and a line break.
      /// </summary>
      private void AppendTextLine(StringBuilder sbLines, HtmlAgilityPack.HtmlNode node)
      {
          // GetTextByHtml is a helper defined outside this block; its result is used as-is.
          string text = GetTextByHtml(CleanText(node.InnerText));
          if (!string.IsNullOrWhiteSpace(text))
          {
              sbLines.Append(text + ",");
          }
          sbLines.AppendLine(",");
      }

    html: 

    csv:

    url:http://www.cnblogs.com/dreamman/p/5343924.html

  • 相关阅读:
    Oracle 定时查询数据插入新表中(job+存储过程)
    serialVersionUID的作用
    如何解决"The superclass "javax.servlet.http.HttpServlet" was not found on the Java Build Path"
    如何解决maven archetype加载太慢的方法
    Java中NIO、BIO、AIO相关概念及应用场景
    Sql多条件排序
    Oracle学习笔记—Oracle左连接、右连接、全外连接以及(+)号用法(转载)
    Oracle学习笔记—oracle体系架构及状态(nomount、mount和open)简介
    Oracle学习笔记—归档模式
    Oracle学习笔记—Db_name、Db_domain、Global_name、Service_name、Instance_name和Oracle_SID(转载)
  • 原文地址:https://www.cnblogs.com/dreamman/p/5343924.html
Copyright © 2011-2022 走看看