1.最近在写爬虫的时候,有的数据是用HTML的<TABLE>披露的,披露的时候会包含rowspan和colspan,
下图是个简单的例子:
对应的HTML代码如下:
<!-- Sample table: the cell "1" spans three rows, and the cell "8" is a merged 2-row by 3-column block. -->
<table border="1">
  <tr>
    <td>Column1</td>
    <td>Column2</td>
    <td>Column3</td>
    <td>Column4</td>
    <td>Column5</td>
    <td>Column6</td>
  </tr>
  <tr>
    <td rowspan="3">1</td>
    <td>2</td>
    <td>3</td>
    <td>4</td>
    <td>5</td>
    <td>6</td>
  </tr>
  <tr>
    <td>7</td>
    <td rowspan="2" colspan="3">8</td>
    <td>9</td>
  </tr>
  <tr>
    <td>10</td>
    <td>11</td>
  </tr>
</table>
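2.我们需要把合并的单元格展开成规则的二维表(每个被合并覆盖的位置都填入原单元格的值),这样才比较方便处理。以上面的表格为例,期望得到的结果是:
Column1 | Column2 | Column3 | Column4 | Column5 | Column6
1 | 2 | 3 | 4 | 5 | 6
1 | 7 | 8 | 8 | 8 | 9
1 | 10 | 8 | 8 | 8 | 11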
3.那么如何转换呢,这里我们需要引用HtmlAgilityPack.dll
代码如下:
using System;
using System.Collections.Generic;
using System.Data;
using System.Globalization;
using System.Linq;
using HtmlAgilityPack;

namespace HtmlToDataTable
{
    static class Program
    {
        /// <summary>
        /// Upper bound applied to <c>colspan</c>. The HTML Standard clamps colspan to 1000,
        /// which also protects against absurd values in scraped markup.
        /// </summary>
        private const int MaxColspan = 1000;

        /// <summary>
        /// The main entry point for the application.
        /// </summary>
        [STAThread]
        static void Main()
        {
            const string html = "<table border=\"1\"><tr><td>Column1</td><td>Column2</td><td>Column3</td><td>Column4</td><td>Column5</td><td>Column6</td></tr><tr><td rowspan=3>1</td><td>2</td><td></td><td>4</td><td>5</td><td>6</td></tr><tr><td></td><td rowspan=2 colspan=3>7</td><td>9</td></tr><tr><td></td><td>8</td></tr></table>";
            var dt = HtmlToDataTable(html);
        }

        /// <summary>
        /// Converts the first <c>&lt;table&gt;</c> found in an HTML fragment into a
        /// <see cref="DataTable"/>, expanding every <c>rowspan</c>/<c>colspan</c> so that each
        /// covered grid position holds the text of the merged cell.
        /// </summary>
        /// <param name="hrml">
        /// The HTML markup to parse. (The parameter name is kept as-is for backward
        /// compatibility with callers that use named arguments.)
        /// </param>
        /// <returns>
        /// A table whose columns are named after the cells of the first row and whose rows are
        /// the remaining rows. Empty header cells are named <c>"Column" + index</c>; duplicate
        /// header names get a numeric suffix. Grid positions that no cell covers are left as
        /// <see cref="DBNull"/>. If the markup contains no table or no rows, an empty table
        /// is returned.
        /// </returns>
        public static DataTable HtmlToDataTable(string hrml)
        {
            var dt = new DataTable();

            var doc = new HtmlDocument();
            doc.LoadHtml(hrml);

            var tables = doc.DocumentNode.SelectNodes("//table");
            if (tables == null)
            {
                return dt;
            }

            // Relative XPath: only rows that belong to this table. An absolute "//tr" would
            // also pick up rows of every other (or nested) table in the document.
            var rows = tables[0].SelectNodes("./tr|./thead/tr|./tbody/tr|./tfoot/tr");
            if (rows == null)
            {
                return dt;
            }

            var grid = BuildGrid(rows);
            var columnCount = grid.Max(cells => cells.Count);

            // The first row supplies the column names.
            var header = grid[0];
            for (var j = 0; j < columnCount; j++)
            {
                var name = j < header.Count ? header[j] : null;
                if (string.IsNullOrEmpty(name))
                {
                    name = "Column" + j;
                }

                dt.Columns.Add(MakeUniqueColumnName(dt, name));
            }

            // Every following row becomes a data row.
            for (var i = 1; i < grid.Length; i++)
            {
                var cells = grid[i];
                var row = dt.NewRow();
                for (var k = 0; k < cells.Count; k++)
                {
                    if (cells[k] != null)
                    {
                        row[k] = cells[k];
                    }
                }

                dt.Rows.Add(row);
            }

            return dt;
        }

        /// <summary>
        /// Lays the cells of all rows out on a rectangular-ish grid, following the usual HTML
        /// table model: each cell is placed at the first free column of its row and then
        /// copied into every position covered by its rowspan/colspan.
        /// </summary>
        /// <param name="rows">The <c>&lt;tr&gt;</c> nodes of one table, in document order.</param>
        /// <returns>
        /// One list per row. A <c>null</c> entry means "no cell covers this position";
        /// an empty string means "covered by a cell with no text".
        /// </returns>
        private static List<string>[] BuildGrid(IList<HtmlNode> rows)
        {
            var rowCount = rows.Count;
            var grid = new List<string>[rowCount];
            for (var r = 0; r < rowCount; r++)
            {
                grid[r] = new List<string>();
            }

            for (var r = 0; r < rowCount; r++)
            {
                var current = grid[r];
                var column = 0;

                foreach (var cell in rows[r].ChildNodes.Where(IsCell))
                {
                    // Skip positions already claimed by a rowspan coming from a row above.
                    while (column < current.Count && current[column] != null)
                    {
                        column++;
                    }

                    var colspan = Math.Min(ReadSpan(cell, "colspan"), MaxColspan);
                    // A rowspan must not reach past the last row of the table.
                    var rowspan = Math.Min(ReadSpan(cell, "rowspan"), rowCount - r);
                    var text = cell.InnerText ?? string.Empty;

                    for (var i = 0; i < rowspan; i++)
                    {
                        var target = grid[r + i];
                        while (target.Count < column + colspan)
                        {
                            target.Add(null);
                        }

                        for (var j = 0; j < colspan; j++)
                        {
                            // On overlapping spans (malformed markup) the earlier cell wins.
                            if (target[column + j] == null)
                            {
                                target[column + j] = text;
                            }
                        }
                    }

                    column += colspan;
                }
            }

            return grid;
        }

        /// <summary>Returns true for table cell elements (<c>td</c> or <c>th</c>).</summary>
        private static bool IsCell(HtmlNode node)
        {
            return string.Equals(node.OriginalName, "td", StringComparison.OrdinalIgnoreCase)
                || string.Equals(node.OriginalName, "th", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Reads a span attribute (<c>rowspan</c> or <c>colspan</c>) from a cell.
        /// Missing, non-numeric, zero or negative values are treated as 1.
        /// </summary>
        private static int ReadSpan(HtmlNode cell, string attributeName)
        {
            var attr = cell.Attributes[attributeName];
            int span;
            if (attr != null
                && int.TryParse(attr.Value, NumberStyles.Integer, CultureInfo.InvariantCulture, out span)
                && span >= 1)
            {
                return span;
            }

            return 1;
        }

        /// <summary>
        /// Returns <paramref name="baseName"/>, or <paramref name="baseName"/> with a
        /// <c>_N</c> suffix when the table already has a column of that name (for example
        /// when a header cell spans several columns).
        /// </summary>
        private static string MakeUniqueColumnName(DataTable dt, string baseName)
        {
            var candidate = baseName;
            var suffix = 1;
            while (dt.Columns.Contains(candidate))
            {
                candidate = baseName + "_" + suffix;
                suffix++;
            }

            return candidate;
        }
    }
}
4.转换成DataTable入库就比较方便了。
代码未经严格测试,如有不当之处,敬请指出!