Html格式内容转Csv内容,包括table(重点在rowspan和colspan合并),p,div元素,table不能包含嵌套功能。
/// <summary>
/// Converts HTML markup to comma-separated text. Every &lt;table&gt; is flattened into a
/// rectangular grid (cells covered by a rowspan/colspan are emitted as empty fields),
/// followed by the text of every &lt;p&gt; and then every &lt;div&gt; element.
/// Nested tables are not supported.
/// </summary>
/// <param name="hrml">The HTML markup to convert. Null or empty input yields an empty string.</param>
/// <returns>
/// The generated text. Each table starts with a "#flag_table#," line, each text section
/// with a "#flag_text#," line, and every data line ends with an extra ",".
/// </returns>
private string HtmlToCsv(string hrml)
{
    if (string.IsNullOrEmpty(hrml))
    {
        return string.Empty;
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.LoadHtml(hrml);
    StringBuilder sbLines = new StringBuilder();

    HtmlAgilityPack.HtmlNodeCollection tList = doc.DocumentNode.SelectNodes("//table");
    if (tList != null)
    {
        foreach (HtmlAgilityPack.HtmlNode table in tList)
        {
            sbLines.AppendLine("#flag_table#,");

            // ".//tr" is relative to this table; a bare "//tr" is evaluated from the
            // document root and would return the rows of every table.
            HtmlAgilityPack.HtmlNodeCollection rows = table.SelectNodes(".//tr");
            if (rows != null)
            {
                AppendTableRows(sbLines, rows);

                // Empty the table so that <p>/<div> elements inside it are not
                // emitted a second time by the text sections below.
                table.RemoveAll();
            }
        }
    }

    HtmlAgilityPack.HtmlNodeCollection pList = doc.DocumentNode.SelectNodes("//p");
    if (pList != null)
    {
        sbLines.AppendLine("#flag_text#,");
        // Paragraphs are emptied after use so an enclosing <div> does not repeat their text.
        AppendTextNodes(sbLines, pList, true);
    }

    HtmlAgilityPack.HtmlNodeCollection dList = doc.DocumentNode.SelectNodes("//div");
    if (dList != null)
    {
        sbLines.AppendLine("#flag_text#,");
        AppendTextNodes(sbLines, dList, false);
    }

    return sbLines.ToString();
}

/// <summary>
/// Lays the cells of <paramref name="rows"/> out on a rectangular grid, honouring
/// rowspan and colspan, and appends one CSV line per row to <paramref name="sbLines"/>.
/// </summary>
private static void AppendTableRows(StringBuilder sbLines, HtmlAgilityPack.HtmlNodeCollection rows)
{
    int rowCount = rows.Count;

    // The first row cannot be shortened by a rowspan from above, so the sum of its
    // colspans is taken as the width of the table.
    int colCount = 0;
    foreach (HtmlAgilityPack.HtmlNode cell in GetRowCells(rows[0]))
    {
        colCount += ParseSpan(cell, "colspan");
    }

    // null = slot not yet claimed; any non-null value (including "") = claimed.
    string[][] grid = new string[rowCount][];
    for (int r = 0; r < rowCount; r++)
    {
        grid[r] = new string[colCount];
    }

    for (int r = 0; r < rowCount; r++)
    {
        int col = 0;
        foreach (HtmlAgilityPack.HtmlNode cell in GetRowCells(rows[r]))
        {
            // Skip slots already claimed by a rowspan that started in an earlier row.
            while (col < colCount && grid[r][col] != null)
            {
                col++;
            }
            if (col >= colCount)
            {
                // The row holds more cells than the table is wide; the extras are dropped.
                break;
            }

            int colspan = ParseSpan(cell, "colspan");
            int rowspan = ParseSpan(cell, "rowspan");
            string text = NormalizeCsvText(cell.InnerText);

            // An empty merged cell is written as a single space, matching the
            // output the method has always produced for merged cells.
            if ((colspan > 1 || rowspan > 1) && text.Length == 0)
            {
                text = " ";
            }

            // Clamp the span to the grid so malformed markup cannot index out of range.
            int rowEnd = System.Math.Min(r + rowspan, rowCount);
            int colEnd = System.Math.Min(col + colspan, colCount);
            for (int ri = r; ri < rowEnd; ri++)
            {
                for (int ci = col; ci < colEnd; ci++)
                {
                    grid[ri][ci] = string.Empty;
                }
            }

            // Only the top-left slot of the spanned area carries the text.
            grid[r][col] = text;
            col = colEnd;
        }
    }

    foreach (string[] line in grid)
    {
        foreach (string value in line)
        {
            // Unclaimed (null) slots append nothing, i.e. they become empty fields.
            sbLines.Append(value).Append(",");
        }
        sbLines.AppendLine(",");
    }
}

/// <summary>
/// Appends one line per node containing its normalized text, or a bare "," line when
/// the text is blank. When <paramref name="clearAfterUse"/> is true the node is
/// emptied afterwards.
/// </summary>
private void AppendTextNodes(StringBuilder sbLines, HtmlAgilityPack.HtmlNodeCollection nodes, bool clearAfterUse)
{
    foreach (HtmlAgilityPack.HtmlNode node in nodes)
    {
        string text = GetTextByHtml(NormalizeCsvText(node.InnerText));
        if (!string.IsNullOrWhiteSpace(text))
        {
            sbLines.Append(text + ",");
        }
        sbLines.AppendLine(",");

        if (clearAfterUse)
        {
            node.RemoveAll();
        }
    }
}

/// <summary>
/// Returns the direct &lt;td&gt; and &lt;th&gt; children of a table row, in document order.
/// </summary>
private static List<HtmlAgilityPack.HtmlNode> GetRowCells(HtmlAgilityPack.HtmlNode row)
{
    return row.ChildNodes
        .Where(m => string.Equals(m.OriginalName, "td", System.StringComparison.OrdinalIgnoreCase)
                 || string.Equals(m.OriginalName, "th", System.StringComparison.OrdinalIgnoreCase))
        .ToList();
}

/// <summary>
/// Reads a rowspan/colspan attribute. Returns 1 when the attribute is missing,
/// is not an integer, or is smaller than 1.
/// </summary>
private static int ParseSpan(HtmlAgilityPack.HtmlNode cell, string attributeName)
{
    HtmlAgilityPack.HtmlAttribute attr = cell.Attributes[attributeName];
    if (attr == null)
    {
        return 1;
    }

    int span;
    bool parsed = int.TryParse(
        attr.Value,
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture,
        out span);
    return (parsed && span >= 1) ? span : 1;
}

/// <summary>
/// Applies the text clean-up shared by table cells, paragraphs and divs.
/// </summary>
private static string NormalizeCsvText(string rawText)
{
    // NOTE(review): the replacement chain is kept exactly as it appears in the published
    // listing. As rendered there, the comma replacement is a no-op and the same space
    // replacement occurs three times; the original literals were presumably different
    // characters (e.g. an entity, a non-ASCII comma, line breaks) - confirm before relying
    // on this to keep commas out of the output.
    return rawText.Replace(" ", "").Replace(",", ",").Replace(" ", "").Replace(" ", "").Trim();
}
html:
csv:
url:http://www.cnblogs.com/dreamman/p/5343924.html