zoukankan      html  css  js  c++  java
  • 爬一下国家统计局行政区划代码C#

    目前NBS上有2015-2018四个年度的代码信息,写一个控制台程序爬一下县级行政区下的代码。

    使用HttpWebRequest+HttpWebResponse获取html,使用HtmlAgilityPack类库解析HTML。

    使用POST请求,请求头带Cookie信息,否则会被反爬机制挡死,返回“请开启JavaScript并刷新该页”。

    县级URL Request获取数据的同时记录Response的Cookie信息,在请求镇级数据时,请求头发送此cookie。

    “省-地-县-乡”与“省-县(地)-乡”的URL长度不同,根据长度判断URL正确性时需注意,也许还有其他可能,暂未发现。

    主方法

      /// <summary>
      /// Console tool that lists the township- and village-level administrative
      /// division codes below a county page of stats.gov.cn.
      /// </summary>
      class Program
      {
          // URL lengths accepted for a county-level page. The two site layouts
          // ("province-prefecture-county" and "province-county") yield different lengths.
          const int CountyUrlLengthShort = 66;
          const int CountyUrlLengthLong = 71;

          /// <summary>
          /// Entry point: asks for a data year, opens the index page in the browser,
          /// asks for a county page URL, then prints "name, class, code" rows for
          /// every township and village found below that county.
          /// </summary>
          static void Main(string[] args)
          {
              Console.ForegroundColor = ConsoleColor.Magenta;
              Console.WriteLine("\n----获取县级行政区乡、村二级区划代码");
              Console.WriteLine("----数据年份有:");
              Console.ResetColor();
              Cursor.WriteAt("A、2018", 2, 0);
              Cursor.WriteAt("B、2017", 12, 0);
              Cursor.WriteAt("C、2016", 2, 1);
              Cursor.WriteAt("D、2015", 12, 1);

              string year = PromptYear();
              if (year == null)
              {
                  // Input stream was closed; nothing more can be asked.
                  return;
              }

              System.Diagnostics.Process.Start($"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/{year}");
              Console.ForegroundColor = ConsoleColor.Magenta;
              Console.WriteLine("浏览器已加载区划代码起始页,请进入县级行政单位页面,复制url,粘贴到下面(回车提交):");
              Console.ResetColor();

              string cityurl = PromptCountyUrl();
              if (cityurl == null)
              {
                  return;
              }

              try
              {
                  Console.ForegroundColor = ConsoleColor.Magenta;
                  List<TownInfo> towns = GetTownInfos(cityurl);
                  if (towns.Count > 0)
                  {
                      List<VillageInfo> rows = new List<VillageInfo>();
                      foreach (TownInfo town in towns)
                      {
                          // Store the township itself as a row (with an empty class)
                          // so that it is printed ahead of its villages.
                          rows.Add(new VillageInfo(town.Code, "", town.Name));
                          if (!string.IsNullOrEmpty(town.Href))
                          {
                              rows.AddRange(GetVillageInfos(town.Href));
                          }
                      }
                      foreach (VillageInfo row in rows)
                      {
                          Console.WriteLine($"{row.Name.Trim()}\t{row.Cls.Trim()}\t{row.Code.Trim()}");
                      }
                  }
                  else
                  {
                      Console.WriteLine("乡镇列表获取失败!");
                  }
              }
              catch (Exception ex)
              {
                  // Print the whole exception chain instead of replacing it with an
                  // empty exception, so the failing URL / root cause stays visible.
                  for (Exception current = ex; current != null; current = current.InnerException)
                  {
                      Console.WriteLine(current.Message);
                  }
              }
              finally
              {
                  Console.ResetColor();
              }
              Console.ReadKey();
          }

          /// <summary>
          /// Repeats the year prompt until the user enters one of the option letters A-D.
          /// </summary>
          /// <returns>The selected year as text, or null when the input stream has ended.</returns>
          static string PromptYear()
          {
              while (true)
              {
                  Console.ForegroundColor = ConsoleColor.Magenta;
                  Console.WriteLine();
                  Console.WriteLine("----请输入一个年份代码(回车提交):");
                  Console.ResetColor();

                  string input = Console.ReadLine();
                  if (input == null)
                  {
                      return null;
                  }

                  input = input.Trim();
                  if (input.Length == 0)
                  {
                      // Empty line: ask again instead of indexing into an empty string.
                      continue;
                  }

                  switch (char.ToLowerInvariant(input[0]))
                  {
                      case 'a':
                          return "2018";
                      case 'b':
                          return "2017";
                      case 'c':
                          return "2016";
                      case 'd':
                          return "2015";
                  }
              }
          }

          /// <summary>
          /// Reads the county page URL, asking again for as long as its length does not
          /// match one of the accepted county URL lengths.
          /// </summary>
          /// <returns>The accepted URL, or null when the input stream has ended.</returns>
          static string PromptCountyUrl()
          {
              while (true)
              {
                  string input = Console.ReadLine();
                  if (input == null)
                  {
                      return null;
                  }

                  // Pasted text often carries stray whitespace that would defeat the length check.
                  input = input.Trim();
                  if (input.Length == CountyUrlLengthShort || input.Length == CountyUrlLengthLong)
                  {
                      return input;
                  }

                  Console.ForegroundColor = ConsoleColor.Magenta;
                  Console.WriteLine("url有误,请确认是县级行政单位页面,重新复制链接,粘贴到下面:");
                  Console.ResetColor();
              }
          }

          // Cookie header sent with every request; GetHtml may replace it with the
          // cookies returned for the county page.
          static string cookies = "AD_RS_COOKIE=20082854; wzws_cid=453a2d88181321410de83ba7eedaba3a141eb61ee7488027b6ab07a66054605e99e886827afa72708ce170398ea2fdfeec55455a7c0be8e779694026255f2166";

          /// <summary>
          /// Downloads a county page and extracts its township rows.
          /// </summary>
          /// <param name="cityurl">County page URL (a string passed as object).</param>
          /// <returns>One entry per usable "towntr" row; empty when the page has none.</returns>
          static List<TownInfo> GetTownInfos(object cityurl)
          {
              string countyUrl = (string)cityurl;
              List<TownInfo> townInfos = new List<TownInfo>();
              HttpGetHelper httpGetHelper = new HttpGetHelper() { Url = countyUrl, ContentType = "text/html; charset=gb2312", Encode = Encoding.GetEncoding(936), RequestMethod = "post" };

              // Parse the HTML with HtmlAgilityPack.
              HtmlDocument document = new HtmlDocument();
              document.LoadHtml(httpGetHelper.GetHtml(1, ref cookies));

              // XPath notes: a leading "//" searches all descendants from the root, a single
              // "/" searches direct children only, and "./" starts at the current node.
              HtmlNodeCollection rows = document.DocumentNode.SelectNodes("//tr[@class='towntr']");
              if (rows == null)
              {
                  // SelectNodes yields null (not an empty collection) when nothing matches.
                  return townInfos;
              }

              // Township links are relative to the directory of the county page.
              string baseUrl = countyUrl.Substring(0, countyUrl.LastIndexOf('/') + 1);
              foreach (HtmlNode row in rows)
              {
                  HtmlNodeCollection cells = row.SelectNodes("./td");
                  if (cells == null || cells.Count < 2)
                  {
                      continue;
                  }

                  HtmlNode link = row.SelectSingleNode(".//a[@href]");
                  string href = link == null ? string.Empty : baseUrl + link.Attributes["href"].Value;
                  townInfos.Add(new TownInfo(cells[0].InnerText, cells[1].InnerText, href));
              }
              return townInfos;
          }

          /// <summary>
          /// Downloads a township page and extracts its village rows.
          /// </summary>
          /// <param name="townurl">Township page URL (a string passed as object).</param>
          /// <returns>One entry per usable "villagetr" row; empty when the page has none.</returns>
          static List<VillageInfo> GetVillageInfos(object townurl)
          {
              List<VillageInfo> villageInfos = new List<VillageInfo>();
              HttpGetHelper httpGetHelper = new HttpGetHelper() { Url = (string)townurl, ContentType = "text/html; charset=gb2312", Encode = Encoding.GetEncoding(936), RequestMethod = "post" };
              HtmlDocument document = new HtmlDocument();
              document.LoadHtml(httpGetHelper.GetHtml(2, ref cookies));

              HtmlNodeCollection rows = document.DocumentNode.SelectNodes("//tr[@class='villagetr']");
              if (rows == null)
              {
                  return villageInfos;
              }

              foreach (HtmlNode row in rows)
              {
                  // Cell order on the page: code, urban/rural class, name.
                  HtmlNodeCollection cells = row.SelectNodes(".//td");
                  if (cells == null || cells.Count < 3)
                  {
                      continue;
                  }
                  villageInfos.Add(new VillageInfo(cells[0].InnerText, cells[1].InnerText, cells[2].InnerText));
              }
              return villageInfos;
          }
      }

    辅助类/结构

     /// <summary>
     /// Writes text at a console position given relative to a fixed origin.
     /// </summary>
     internal class Cursor
     {
         // Origin that the relative coordinates passed to WriteAt are added to.
         const int BaseColumn = 0;
         const int BaseRow = 3;

         /// <summary>
         /// Moves the console cursor and writes <paramref name="s"/> there.
         /// </summary>
         /// <param name="s">Text to write.</param>
         /// <param name="c">Column offset from the origin column.</param>
         /// <param name="r">Row offset from the origin row.</param>
         public static void WriteAt(string s, int c, int r)
         {
             int left = BaseColumn + c;
             int top = BaseRow + r;
             Console.SetCursorPosition(left, top);
             Console.Write(s);
         }
     }
    /// <summary>
    /// Township record: code, name and hyperlink.
    /// </summary>
    struct TownInfo
    {
        /// <summary>Division code of the township.</summary>
        public string Code { get; }

        /// <summary>Name of the township.</summary>
        public string Name { get; }

        /// <summary>Hyperlink stored for the township.</summary>
        public string Href { get; }

        /// <summary>Creates a township record from its three values.</summary>
        public TownInfo(string code, string name, string href)
        {
            Code = code;
            Name = name;
            Href = href;
        }
    }
    /// <summary>
    /// Village record: code, urban/rural classification and name.
    /// </summary>
    struct VillageInfo
    {
        /// <summary>Division code of the village.</summary>
        public string Code { get; }

        /// <summary>Urban/rural classification value.</summary>
        public string Cls { get; }

        /// <summary>Name of the village.</summary>
        public string Name { get; }

        /// <summary>Creates a village record from its three values.</summary>
        public VillageInfo(string code, string cls, string name)
        {
            Code = code;
            Cls = cls;
            Name = name;
        }
    }

    获取HTML

     /// <summary>
     /// Configurable wrapper around <see cref="HttpWebRequest"/> that downloads a
     /// page as text and can hand back the cookies returned by the server.
     /// </summary>
     public class HttpGetHelper
     {
         string url = string.Empty;
         /// <summary>Address to request. GetHtml returns an empty string while this is empty.</summary>
         public string Url
         {
             set { url = value; }
         }

         int timeOut = 10 * 1000;
         /// <summary>Request timeout in milliseconds (default 10 seconds).</summary>
         public int Timeout
         {
             set { timeOut = value; }
         }

         string contentType = "text/html;charset=utf-8";
         /// <summary>Value sent as the Content-Type request header.</summary>
         public string ContentType
         {
             set { contentType = value; }
         }

         string userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36 ";
         /// <summary>Value sent as the User-Agent request header.</summary>
         public string UserAgent
         {
             set { userAgent = value; }
         }

         Encoding encode = Encoding.UTF8;
         /// <summary>Encoding used to decode the response body (default UTF-8).</summary>
         public Encoding Encode
         {
             set { encode = value; }
         }

         string request_Method = "get";
         /// <summary>HTTP method used for the request (default "get").</summary>
         public string RequestMethod
         {
             set { request_Method = value; }
         }

         /// <summary>
         /// Requests <see cref="Url"/> and returns the response body as text.
         /// </summary>
         /// <param name="cls">Page level: town = 1, village = 2.</param>
         /// <param name="cookies">
         /// Cookie header sent with the request. When <paramref name="cls"/> is 1 and the
         /// response carries cookies, it is replaced by a header built from all of them.
         /// </param>
         /// <returns>
         /// The response body, or an empty string when no URL is set or the status is not 200 OK.
         /// </returns>
         /// <exception cref="Exception">
         /// Any failure while requesting or reading; the original error is kept as InnerException.
         /// </exception>
         public string GetHtml(int cls, ref string cookies)
         {
             string html = string.Empty;
             try
             {
                 if (url != string.Empty)
                 {
                     HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest;
                     request.Timeout = this.timeOut;
                     request.ContentType = this.contentType;
                     request.UserAgent = this.userAgent;
                     if (!string.IsNullOrEmpty(cookies))
                     {
                         request.Headers.Add(HttpRequestHeader.Cookie, cookies);
                     }
                     request.Method = request_Method;
                     using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
                     {
                         if (response.StatusCode == HttpStatusCode.OK)
                         {
                             // For the county-level page, remember the cookies the server returned.
                             if (cls == 1)
                             {
                                 // NOTE(review): the HttpWebResponse.Cookies documentation says the
                                 // collection is only filled when request.CookieContainer is assigned;
                                 // confirm cookies are actually captured here.
                                 string received = BuildCookieHeader(response.Cookies);
                                 if (received.Length > 0)
                                 {
                                     // Keep the previous header when the response carried no cookies.
                                     cookies = received;
                                 }
                             }

                             using (StreamReader streamReader = new StreamReader(response.GetResponseStream(), encode))
                             {
                                 html = streamReader.ReadToEnd();
                             }
                         }
                     }
                 }
             }
             catch (Exception ex)
             {
                 // Preserve the root cause for whoever reports the failure.
                 throw new Exception($"GetHtml失败,url:{url}", ex);
             }
             return html;
         }

         /// <summary>
         /// Formats every cookie of a collection as one Cookie request-header value.
         /// </summary>
         /// <param name="responseCookies">Cookies to format; may be null or empty.</param>
         /// <returns>"name=value" pairs joined with "; ", or an empty string when there are none.</returns>
         public static string BuildCookieHeader(CookieCollection responseCookies)
         {
             if (responseCookies == null || responseCookies.Count == 0)
             {
                 return string.Empty;
             }

             StringBuilder header = new StringBuilder();
             foreach (Cookie cookie in responseCookies)
             {
                 if (header.Length > 0)
                 {
                     header.Append("; ");
                 }
                 header.Append(cookie.Name).Append('=').Append(cookie.Value);
             }
             return header.ToString();
         }
     }
  • 相关阅读:
    USACO Milk2 区间合并
    Codeforces 490B Queue【模拟】
    HDU 3974 Assign the task 简单搜索
    HDU 5119 Happy Matt Friends(2014北京区域赛现场赛H题 裸背包DP)
    Cin、Cout 加快效率方法
    POJ 1159 回文LCS滚动数组优化
    POJ 2479 不相交最大子段和
    POJ 1458 最长公共子序列 LCS
    在阿里最深刻的,还是职场之道给我的震撼
    精细化
  • 原文地址:https://www.cnblogs.com/yzhyingcool/p/10705889.html
Copyright © 2011-2022 走看看