using System;
using System.Collections.Generic;
using System.Text;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using HtmlAgilityPack;

/// <summary>
/// Code-behind page that scrapes the 2015 administrative-division tables from
/// stats.gov.cn with HtmlAgilityPack and writes each division's name and code
/// (right-padded with '0' to 12 characters) to the HTTP response.
/// </summary>
public partial class 抓取区县 : System.Web.UI.Page
{
    /// <summary>Entry page that lists the province-level links.</summary>
    private const string IndexUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";

    /// <summary>Encoding forced on every downloaded page.</summary>
    private const string PageEncodingName = "GB2312";

    /// <summary>Links whose href contains this marker (the site's filing/record link) are skipped.</summary>
    private const string ExcludedLinkMarker = "miibeian";

    /// <summary>Codes are right-padded with '0' up to this many characters.</summary>
    private const int CodeLength = 12;

    /// <summary>Deepest level that is listed (4 = town rows); recursion stops once it is reached.</summary>
    private const int MaxDepth = 4;

    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Downloads the index page and writes one line per province-level link:
    /// the link text followed by the code derived from the link's href.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Button1_Click(object sender, EventArgs e)
    {
        HtmlDocument doc = LoadDocument(IndexUrl);
        HtmlNodeCollection links = doc.DocumentNode.SelectNodes(".//a[@href]");
        if (links == null)
        {
            return;
        }

        foreach (HtmlNode link in links)
        {
            string target = link.Attributes["href"].Value;
            if (target.Contains(ExcludedLinkMarker))
            {
                continue;
            }

            // The province code is the href without its ".html" suffix, e.g. "14.html" -> "14".
            string code = target.Replace(".html", "").PadRight(CodeLength, '0');
            WriteEntry(1, link.InnerText, code);

            // Only the province level is listed here. To descend further, resolve
            // the href against IndexUrl and pass it to GetArea with depth 1.
        }
    }

    /// <summary>
    /// Recursively lists the divisions found on one table page.
    /// </summary>
    /// <param name="Url">Absolute address of the page to read.</param>
    /// <param name="dep">Depth of the parent level; the rows read here are at <c>dep + 1</c>.</param>
    /// <param name="parentCode">
    /// Code of the parent division. Kept so existing calls still compile; it is no
    /// longer read because child addresses are resolved against <paramref name="Url"/>.
    /// </param>
    private void GetArea(string Url, int dep, string parentCode)
    {
        dep++;

        HtmlDocument doc = LoadDocument(Url);
        HtmlNodeCollection nameCells = doc.DocumentNode.SelectNodes(
            ".//tr[@class='" + classname(dep) + "']/td[last()]");
        if (nameCells == null)
        {
            return;
        }

        Uri pageUri = new Uri(Url);
        foreach (HtmlNode nameCell in nameCells)
        {
            HtmlNode link = nameCell.SelectSingleNode(".//a[@href]");
            string href = link == null ? null : link.Attributes["href"].Value;
            if (href != null && href.Contains(ExcludedLinkMarker))
            {
                continue;
            }

            // The code sits in the cell before the name cell. Skip whitespace/text
            // nodes between cells, and skip the row if there is no such cell.
            HtmlNode codeCell = PreviousElement(nameCell);
            if (codeCell == null)
            {
                continue;
            }

            string codeNum = PlainText(codeCell.InnerText).PadRight(CodeLength, '0');
            WriteEntry(dep, nameCell.InnerText, codeNum);

            // Rows without a link are leaves; rows at MaxDepth are not followed.
            if (href == null || dep >= MaxDepth)
            {
                continue;
            }

            // Resolve the relative href against the current page instead of
            // rewriting the URL with string replacement.
            Uri childUri;
            if (Uri.TryCreate(pageUri, href, out childUri))
            {
                GetArea(childUri.AbsoluteUri, dep, codeNum);
            }
        }
    }

    /// <summary>
    /// Builds the indentation prefix for a level: one "-----|" segment per level above the first.
    /// </summary>
    /// <param name="dep">Depth of the entry (1 = top level, which gets no prefix).</param>
    /// <returns>The prefix string; empty when <paramref name="dep"/> is 1 or less.</returns>
    private string joinstr(int dep)
    {
        StringBuilder prefix = new StringBuilder();
        for (int level = 1; level < dep; level++)
        {
            prefix.Append("-----|");
        }

        return prefix.ToString();
    }

    /// <summary>
    /// Maps a depth to the CSS class of the table rows that hold that level's entries.
    /// </summary>
    /// <param name="dep">Depth of the level (2, 3 or 4).</param>
    /// <returns>The row class name, or an empty string for any other depth.</returns>
    private string classname(int dep)
    {
        switch (dep)
        {
            case 2:
                return "citytr";
            case 3:
                return "countytr";
            case 4:
                return "towntr";
            default:
                return "";
        }
    }

    /// <summary>Downloads and parses a page, forcing the site's GB2312 encoding.</summary>
    private static HtmlDocument LoadDocument(string url)
    {
        HtmlWeb web = new HtmlWeb();
        web.OverrideEncoding = Encoding.GetEncoding(PageEncodingName);
        return web.Load(url);
    }

    /// <summary>Returns the nearest preceding sibling that is an element, or null if there is none.</summary>
    private static HtmlNode PreviousElement(HtmlNode node)
    {
        HtmlNode candidate = node.PreviousSibling;
        while (candidate != null && candidate.NodeType != HtmlNodeType.Element)
        {
            candidate = candidate.PreviousSibling;
        }

        return candidate;
    }

    /// <summary>Decodes HTML entities in scraped text and trims surrounding whitespace.</summary>
    private static string PlainText(string raw)
    {
        return HtmlEntity.DeEntitize(raw).Trim();
    }

    /// <summary>
    /// Writes one "name(code)" line to the response, prefixed for its depth.
    /// The scraped text is HTML-encoded because it comes from a remote page.
    /// </summary>
    private void WriteEntry(int dep, string rawName, string code)
    {
        Response.Write(
            joinstr(dep)
            + HttpUtility.HtmlEncode(PlainText(rawName))
            + "(" + HttpUtility.HtmlEncode(code) + ")<br/>");
    }
}
以上是我写的完整代码,使用 HtmlAgilityPack 组件解析 HTML;该组件是开源的,直接下载即可使用。关键数据(如区划代码和名称)都已经能够获取到。
你只需将这些数据写入自己的数据库即可。
效果如下: