目前NBS上有2015-2018四个年度的代码信息,写一个控制台程序爬一下县级行政区下的代码。
使用HttpWebRequest+HttpWebResponse获取html,使用HtmlAgilityPack类库解析HTML。
使用POST请求,请求头带Cookie信息,否则会被反爬机制挡死,返回“请开启JavaScript并刷新该页”。
请求县级URL获取数据的同时,记录Response中的Cookie信息;之后请求乡镇级页面时,在请求头中发送此Cookie。
“省-地-县-乡”与“省-县(地)-乡”两种层级的URL长度不同(代码中分别按71和66个字符校验),根据长度判断URL正确性时需注意;也许还有其他可能,暂未发现。
主方法
/// <summary>
/// Console crawler: asks for a data year and a county-level page URL, then prints
/// the township rows of that page followed by the village rows of each township page.
/// </summary>
class Program
{
    // Cookie header value sent with every request. It is passed by ref to
    // HttpGetHelper.GetHtml, which may replace it with cookies from the county response.
    static string cookies = "AD_RS_COOKIE=20082854; wzws_cid=453a2d88181321410de83ba7eedaba3a141eb61ee7488027b6ab07a66054605e99e886827afa72708ce170398ea2fdfeec55455a7c0be8e779694026255f2166";

    /// <summary>
    /// Entry point: drives the prompts, runs the crawl and prints one line per
    /// township/village as "name class code".
    /// </summary>
    /// <param name="args">Unused command-line arguments.</param>
    static void Main(string[] args)
    {
        Console.ForegroundColor = ConsoleColor.Magenta;
        Console.WriteLine(" ----获取县级行政区乡、村二级区划代码");
        Console.WriteLine("----数据年份有:");
        Console.ResetColor();
        Cursor.WriteAt("A、2018", 2, 0);
        Cursor.WriteAt("B、2017", 12, 0);
        Cursor.WriteAt("C、2016", 2, 1);
        Cursor.WriteAt("D、2015", 12, 1);

        string year = PromptForYear();
        if (year == null)
        {
            // Input stream closed; nothing more can be asked.
            return;
        }

        System.Diagnostics.Process.Start($"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{year}");
        Console.ForegroundColor = ConsoleColor.Magenta;
        Console.WriteLine("浏览器已加载区划代码起始页,请进入县级行政单位页面,复制url,粘贴到下面(回车提交):");
        Console.ResetColor();

        string cityurl = PromptForCountyUrl();
        if (cityurl == null)
        {
            return;
        }

        try
        {
            Console.ForegroundColor = ConsoleColor.Magenta;
            List<TownInfo> towns = GetTownInfos(cityurl);
            if (towns.Count > 0)
            {
                List<VillageInfo> villageInfos = new List<VillageInfo>();
                foreach (TownInfo town in towns)
                {
                    // The township itself is emitted as a row (with an empty class column)
                    // ahead of its villages so it shows up in the output.
                    villageInfos.Add(new VillageInfo(town.Code, "", town.Name));
                    villageInfos.AddRange(GetVillageInfos(town.Href));
                }

                foreach (VillageInfo row in villageInfos)
                {
                    Console.WriteLine($"{row.Name.Trim()} {row.Cls.Trim()} {row.Code.Trim()}");
                }
            }
            else
            {
                Console.WriteLine("乡镇列表获取失败!");
            }
        }
        catch (Exception ex)
        {
            // Report the real cause instead of replacing it with an empty exception.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"抓取失败:{ex.Message}");
            if (ex.InnerException != null)
            {
                Console.WriteLine(ex.InnerException.Message);
            }
            Console.ResetColor();
        }

        Console.ReadKey();
    }

    /// <summary>
    /// Repeats the year prompt until the first character of the answer is a-d (any case).
    /// </summary>
    /// <returns>"2018".."2015" for a..d, or null when the input stream has ended.</returns>
    static string PromptForYear()
    {
        while (true)
        {
            Console.ForegroundColor = ConsoleColor.Magenta;
            Console.WriteLine();
            Console.WriteLine("----请输入一个年份代码(回车提交):");
            Console.ResetColor();

            string line = Console.ReadLine();
            if (line == null)
            {
                return null;
            }
            if (line.Length == 0)
            {
                // An empty line has no first character to inspect; ask again.
                continue;
            }

            switch (char.ToLowerInvariant(line[0]))
            {
                case 'a': return "2018";
                case 'b': return "2017";
                case 'c': return "2016";
                case 'd': return "2015";
            }
        }
    }

    /// <summary>
    /// Reads the county page URL and keeps asking until its length is 66 or 71 characters,
    /// the two lengths the county-level URLs are expected to have.
    /// </summary>
    /// <returns>The trimmed URL, or null when the input stream has ended.</returns>
    static string PromptForCountyUrl()
    {
        string input = Console.ReadLine();
        while (input != null)
        {
            string candidate = input.Trim();
            if (candidate.Length == 66 || candidate.Length == 71)
            {
                return candidate;
            }

            Console.ForegroundColor = ConsoleColor.Magenta;
            Console.WriteLine("url有误,请确认是县级行政单位页面,重新复制链接,粘贴到下面:");
            Console.ResetColor();
            input = Console.ReadLine();
        }
        return null;
    }

    /// <summary>
    /// Downloads a page with the shared cookie string and parses it with HtmlAgilityPack.
    /// </summary>
    /// <param name="url">Page to request.</param>
    /// <param name="cls">Level flag forwarded to GetHtml (1 = county page, 2 = township page).</param>
    static HtmlDocument LoadDocument(string url, int cls)
    {
        HttpGetHelper httpGetHelper = new HttpGetHelper()
        {
            Url = url,
            ContentType = "text/html; charset=gb2312",
            Encode = Encoding.GetEncoding(936),
            RequestMethod = "post"
        };
        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(httpGetHelper.GetHtml(cls, ref cookies));
        return document;
    }

    /// <summary>
    /// Gets the township list from a county-level page.
    /// </summary>
    /// <param name="cityurl">County page URL (a string boxed as object).</param>
    /// <returns>
    /// One entry per <c>tr.towntr</c> row; empty when the page holds no such rows.
    /// A row without a link gets an empty Href.
    /// </returns>
    static List<TownInfo> GetTownInfos(object cityurl)
    {
        List<TownInfo> townInfos = new List<TownInfo>();
        string url = (string)cityurl;
        // Township links are relative to the directory of the county page.
        string baseUrl = url.Substring(0, url.LastIndexOf('/') + 1);

        // XPath: "//" searches all descendants from the root, "/" only direct children,
        // and a leading "." starts from the current node instead of the root.
        HtmlNodeCollection rows = LoadDocument(url, 1).DocumentNode.SelectNodes("//tr[@class='towntr']");
        if (rows == null)
        {
            // SelectNodes yields null (not an empty collection) when nothing matches.
            return townInfos;
        }

        foreach (HtmlNode row in rows)
        {
            HtmlNodeCollection cells = row.SelectNodes("./td");
            if (cells == null || cells.Count < 2)
            {
                continue;
            }

            HtmlNode link = row.SelectSingleNode(".//a[@href]");
            string href = link == null ? string.Empty : baseUrl + link.Attributes["href"].Value;
            townInfos.Add(new TownInfo(cells[0].InnerText, cells[1].InnerText, href));
        }
        return townInfos;
    }

    /// <summary>
    /// Gets the village list from a township-level page.
    /// </summary>
    /// <param name="townurl">Township page URL (a string boxed as object).</param>
    /// <returns>
    /// One entry per <c>tr.villagetr</c> row (code, class, name); empty when the page
    /// holds no such rows.
    /// </returns>
    static List<VillageInfo> GetVillageInfos(object townurl)
    {
        List<VillageInfo> villageInfos = new List<VillageInfo>();
        HtmlNodeCollection rows = LoadDocument((string)townurl, 2).DocumentNode.SelectNodes("//tr[@class='villagetr']");
        if (rows == null)
        {
            return villageInfos;
        }

        foreach (HtmlNode row in rows)
        {
            HtmlNodeCollection cells = row.SelectNodes(".//td");
            if (cells == null || cells.Count < 3)
            {
                continue;
            }
            villageInfos.Add(new VillageInfo(cells[0].InnerText, cells[1].InnerText, cells[2].InnerText));
        }
        return villageInfos;
    }
}
辅助类/结构
/// <summary>
/// Writes text at console positions expressed relative to a fixed origin.
/// </summary>
internal class Cursor
{
    // Origin of the relative coordinate system inside the console buffer.
    private const int BaseColumn = 0;
    private const int BaseRow = 3;

    /// <summary>
    /// Moves the console cursor to the given offset from the origin and writes the text.
    /// </summary>
    /// <param name="s">Text to write.</param>
    /// <param name="c">Column offset from the origin.</param>
    /// <param name="r">Row offset from the origin.</param>
    public static void WriteAt(string s, int c, int r)
    {
        int left = BaseColumn + c;
        int top = BaseRow + r;
        Console.SetCursorPosition(left, top);
        Console.Write(s);
    }
}

/// <summary>
/// Township record: code, name and hyperlink.
/// </summary>
struct TownInfo
{
    public TownInfo(string code, string name, string href)
    {
        Code = code;
        Name = name;
        Href = href;
    }

    /// <summary>Township code.</summary>
    public string Code { get; }

    /// <summary>Township name.</summary>
    public string Name { get; }

    /// <summary>Hyperlink stored for this township.</summary>
    public string Href { get; }
}

/// <summary>
/// Village record: code, urban-rural classification and name.
/// </summary>
struct VillageInfo
{
    public VillageInfo(string code, string cls, string name)
    {
        Code = code;
        Cls = cls;
        Name = name;
    }

    /// <summary>Village code.</summary>
    public string Code { get; }

    /// <summary>Urban-rural classification.</summary>
    public string Cls { get; }

    /// <summary>Village name.</summary>
    public string Name { get; }
}
获取HTML
/// <summary>
/// Small HttpWebRequest wrapper that downloads a page as text, sending a caller-supplied
/// Cookie header and optionally recording the cookies of the response.
/// </summary>
public class HttpGetHelper
{
    string url = string.Empty;
    /// <summary>Target URL. While it is empty, GetHtml performs no request.</summary>
    public string Url
    {
        set { url = value; }
    }

    int timeOut = 10 * 1000;
    /// <summary>Request timeout in milliseconds (default 10000).</summary>
    public int Timeout
    {
        set { timeOut = value; }
    }

    string contentType = "text/html;charset=utf-8";
    /// <summary>Content-Type header of the request.</summary>
    public string ContentType
    {
        set { contentType = value; }
    }

    string userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36 ";
    /// <summary>User-Agent header of the request.</summary>
    public string UserAgent
    {
        set { userAgent = value; }
    }

    Encoding encode = Encoding.UTF8;
    /// <summary>Encoding used to decode the response body (default UTF-8).</summary>
    public Encoding Encode
    {
        set { encode = value; }
    }

    string request_Method = "get";
    /// <summary>HTTP method of the request (default "get").</summary>
    public string RequestMethod
    {
        set { request_Method = value; }
    }

    /// <summary>
    /// Gets the HTML content of <see cref="Url"/>.
    /// </summary>
    /// <param name="cls">town=1; village=2. Response cookies are recorded only when this is 1.</param>
    /// <param name="cookies">
    /// Cookie header value to send. When <paramref name="cls"/> is 1 and the response
    /// carries cookies, it is replaced by all of them joined as "name=value; name=value".
    /// </param>
    /// <returns>
    /// The decoded response body; an empty string when the URL is empty or the status
    /// code is not 200 OK.
    /// </returns>
    /// <exception cref="Exception">
    /// Any failure while requesting or reading; the original error is kept as InnerException.
    /// </exception>
    public string GetHtml(int cls, ref string cookies)
    {
        string html = string.Empty;
        if (url == string.Empty)
        {
            return html;
        }

        try
        {
            HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest;
            request.Timeout = this.timeOut;
            request.ContentType = this.contentType;
            request.UserAgent = this.userAgent;
            if (!string.IsNullOrEmpty(cookies))
            {
                request.Headers.Add(HttpRequestHeader.Cookie, cookies);
            }
            request.Method = request_Method;

            using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    return html;
                }

                // For the county-level URL, remember the cookies handed back by the server.
                // NOTE(review): no CookieContainer is assigned to the request, so
                // response.Cookies may always be empty here - confirm against the
                // HttpWebRequest.CookieContainer documentation.
                if (cls == 1 && response.Cookies.Count > 0)
                {
                    StringBuilder collected = new StringBuilder();
                    foreach (Cookie item in response.Cookies)
                    {
                        if (collected.Length > 0)
                        {
                            collected.Append("; ");
                        }
                        collected.Append(item.Name).Append('=').Append(item.Value);
                    }
                    cookies = collected.ToString();
                }

                using (StreamReader streamReader = new StreamReader(response.GetResponseStream(), encode))
                {
                    html = streamReader.ReadToEnd();
                }
            }
        }
        catch (Exception ex)
        {
            // Keep the root cause attached so callers can see why the request failed.
            throw new Exception($"GetHtml失败,url:{url}", ex);
        }
        return html;
    }
}