zoukankan      html  css  js  c++  java
  • C#网页爬虫抓取行政区划

    借鉴C#网页爬虫抓取行政区划,从国家统计局获取了最新行政区域数据。

    以下为代码片段:

    数据库类:

    /// <summary>
    /// One administrative-division row (nation, province, city or county)
    /// as stored in the base_city table.
    /// </summary>
    public class City {
        // ---- identity ----

        /// <summary>Numeric row identifier.</summary>
        public decimal ID { get; set; }

        /// <summary>Administrative-division code, e.g. "100000" for the nationwide root entry.</summary>
        public string Code { get; set; }

        /// <summary>Display name of the division.</summary>
        public string Name { get; set; }

        // ---- hierarchy ----

        /// <summary>Hierarchy level: "1" = nation, "2" = province, "3" = city, "4" = county.</summary>
        public string Org_Level { get; set; }

        /// <summary>ID of the parent division.</summary>
        public decimal ParentID { get; set; }

        /// <summary>Code of the parent division.</summary>
        public string ParentCode { get; set; }

        // ---- location ----

        /// <summary>Country name. The property name keeps its existing spelling because callers and the table column use it.</summary>
        public string Contry { get; set; }

        /// <summary>X location value (presumably a longitude; confirm against the consumer of base_city).</summary>
        public string Loc_x { get; set; }

        /// <summary>Y location value (presumably a latitude; confirm against the consumer of base_city).</summary>
        public string Loc_y { get; set; }
    }

    获取网页帮助类:

     /// <summary>
     /// Small helper for downloading the HTML of a page over HTTP.
     /// </summary>
     public class HttpHelper {
       private static readonly ILog log = log4net.LogManager.GetLogger(typeof(HttpHelper));

       /// <summary>
       /// Requests <paramref name="url"/> and decodes the response body with <paramref name="encod"/>.
       /// </summary>
       /// <param name="url">Address of the page to download.</param>
       /// <param name="encod">Encoding used to decode the response stream.</param>
       /// <returns>
       /// The decoded response body, or an empty string when the request or the read fails
       /// or the status code is not 200. Failures are logged and never propagated to the caller.
       /// </returns>
       public static string DownloadHtml(string url,Encoding encod) {
         string html = string.Empty;
         try {
           // Direct cast: a non-HTTP url now surfaces as a descriptive InvalidCastException
           // in the log instead of a NullReferenceException on the next line.
           HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
           request.Timeout = 10 * 1000; // 10 s timeout
           request.ContentType = "text/html;charset=utf-8";
           request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36";

           using(HttpWebResponse resp = (HttpWebResponse)request.GetResponse()) {
             if(resp.StatusCode != HttpStatusCode.OK) {
               log.Fatal(string.Format("抓取{0}地址返回失败,response.StatusCode = {1}",url,resp.StatusCode));
             } else {
               try {
                 // 'using' guarantees the reader (and the underlying stream) is released
                 // even when ReadToEnd throws part-way through the body.
                 using(StreamReader sr = new StreamReader(resp.GetResponseStream(),encod)) {
                   html = sr.ReadToEnd();
                 }
               } catch(Exception e) {
                 log.Fatal(string.Format("DownLoadHtml抓取html{0}保存失败",url),e);
               }
             }
           }
         } catch(Exception e) {
           // Best-effort download: log and fall through to return the empty string.
           log.Fatal(e);
         }
         return html;
       }
     }

    数据库保存帮助类:

      /// <summary>
      /// Database helper that persists crawled <see cref="City"/> rows.
      /// </summary>
      public class SQLHelper {

        /// <summary>
        /// Inserts every entry of <paramref name="cityList"/> into the base_city table,
        /// one INSERT per entry, using the "DBContext" connection string from configuration.
        /// A failing row is reported on the console and skipped; the remaining rows are still attempted.
        /// </summary>
        /// <param name="cityList">
        /// Rows to insert. Null string properties are written as empty strings;
        /// the objects themselves are not modified. The state column is always written as "1".
        /// </param>
        /// <returns>Sum of the affected-row counts returned by the individual INSERT commands.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="cityList"/> is null.</exception>
        public static int ExecuteNonQueryForCity(List<City> cityList) {
          if(cityList == null) {
            throw new ArgumentNullException("cityList");
          }
          if(cityList.Count == 0) {
            // Nothing to insert: do not open a connection at all.
            return 0;
          }

          int count = 0;
          var connectionString = System.Configuration.ConfigurationManager.ConnectionStrings["DBContext"].ConnectionString;
          using(SqlConnection connection = new SqlConnection(connectionString))
          using(SqlCommand cmd = new SqlCommand()) {
            // A freshly constructed connection is always closed, and 'using' closes it again on exit.
            connection.Open();
            cmd.Connection = connection;
            cmd.CommandText = "insert into base_city(ID,name,Code,Contry,Loc_x,Loc_y,Org_Level,ParentCode,ParentID,state) values(@ID,@name,@Code,@Contry,@Loc_x,@Loc_y,@Org_Level,@ParentCode,@ParentID,@state)";
            foreach(var city in cityList) {
              try {
                cmd.Parameters.Add(new SqlParameter("@ID",city.ID));
                cmd.Parameters.Add(TextParameter("@name",city.Name));
                cmd.Parameters.Add(TextParameter("@Code",city.Code));
                cmd.Parameters.Add(TextParameter("@Contry",city.Contry));
                cmd.Parameters.Add(TextParameter("@Loc_x",city.Loc_x));
                cmd.Parameters.Add(TextParameter("@Loc_y",city.Loc_y));
                cmd.Parameters.Add(TextParameter("@Org_Level",city.Org_Level));
                cmd.Parameters.Add(TextParameter("@ParentCode",city.ParentCode));
                cmd.Parameters.Add(new SqlParameter("@ParentID",city.ParentID));
                cmd.Parameters.Add(new SqlParameter("@state","1"));

                int retval = cmd.ExecuteNonQuery();
                if(retval == 0) {
                  Console.WriteLine("插入错误:");
                }
                count += retval;
              } catch(Exception e) {
                Console.WriteLine("插入错误:" + e.Message);
              } finally {
                // The command is reused for the next row, so its parameters must always be reset.
                cmd.Parameters.Clear();
              }
            }
          }
          return count;
        }

        /// <summary>
        /// Builds a string parameter, substituting an empty string for null so the
        /// parameter is always supplied to the command.
        /// </summary>
        private static SqlParameter TextParameter(string name,string value) {
          return new SqlParameter(name,value ?? string.Empty);
        }
      }

    抓取数据:

     /// <summary>
     /// Downloads the administrative-division page at <see cref="UrlStr"/>, parses its
     /// province / city / county rows into <see cref="SaveList"/>, and can persist them via <see cref="Save"/>.
     /// Levels are encoded in City.Org_Level: "1" nation, "2" province, "3" city, "4" county.
     /// </summary>
     public class 省市县数据抓取 {
       private readonly ILog log = log4net.LogManager.GetLogger(typeof(省市县数据抓取));
       public const string UrlStr = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html";
       public List<City> SaveList = new List<City>();

       /// <summary>
       /// Downloads and parses the page immediately. Any failure is logged and rethrown.
       /// </summary>
       /// <exception cref="InvalidOperationException">
       /// The page contains no matching paragraph rows (for example because the download returned nothing).
       /// </exception>
       public 省市县数据抓取() {
         try {
           log.Info("抓取数据");

           // The nationwide root entry goes in first so every province has a parent to attach to.
           SaveList.Add(new City() {
             ID = 1,
             Name = "全国",
             Code = "100000",
             Contry = "China",
             Org_Level = "1"
           });

           string HtmlStr = HttpHelper.DownloadHtml(UrlStr,Encoding.UTF8);
           HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
           doc.LoadHtml(HtmlStr);
           string liPath = "//p[@class='MsoNormal']";
           HtmlNodeCollection goodsNodeCollection = doc.DocumentNode.SelectNodes(liPath);
           if(goodsNodeCollection == null) {
             // SelectNodes yields null (not an empty collection) when nothing matches.
             throw new InvalidOperationException("No rows matching " + liPath + " were found at " + UrlStr);
           }

           foreach(var item in goodsNodeCollection) {
             var firstNode = item.FirstChild;
             if(firstNode == null) {
               // Empty paragraph: nothing to classify.
               continue;
             }
             // The row kind is recognised from its first child: bold text marks a province,
             // a whitespace prefix of one or two characters marks a city or a county.
             // NOTE(review): the whitespace literals below are copied verbatim; confirm they
             // match the characters the page really uses (they may be non-breaking spaces).
             if(firstNode.Name == "b")
               GetProvince(item);
             else if(firstNode.InnerText == " ") {
               GetCity(item);
             } else if(firstNode.InnerText == "  ") {
               GetCounty(item);
             }
           }

         } catch(Exception e) {
           // LastOrDefault: never let the diagnostic itself throw and hide the real error.
           City last = SaveList.LastOrDefault();
           log.Info("last child code:" + (last == null ? "" : last.Code));
           log.Info(e);
           throw; // preserves the original stack trace
         }
       }

       /// <summary>Adds a county row (level "4") under the most recently added city.</summary>
       private void GetCounty(HtmlNode item) {
         AddRegion(item.ChildNodes[1].InnerText,item.ChildNodes[2].InnerText,"4","3");
       }

       /// <summary>Adds a city row (level "3") under the most recently added province.</summary>
       private void GetCity(HtmlNode item) {
         AddRegion(item.ChildNodes[1].InnerText,item.ChildNodes[2].InnerText,"3","2");
       }

       /// <summary>Adds a province row (level "2") under the nationwide root entry.</summary>
       private void GetProvince(HtmlNode item) {
         AddRegion(item.ChildNodes[0].FirstChild.InnerText,item.ChildNodes[1].FirstChild.InnerText,"2","1");
       }

       /// <summary>
       /// Appends one division to <see cref="SaveList"/>. Rows arrive in document order
       /// (province, its cities, their counties, next province, ...), so the parent is
       /// the last entry already in the list that has <paramref name="parentLevel"/>.
       /// </summary>
       /// <param name="rawCode">Code text as read from the page; spaces are removed and the result trimmed.</param>
       /// <param name="rawName">Name text as read from the page; trimmed.</param>
       /// <param name="level">Org_Level assigned to the new entry.</param>
       /// <param name="parentLevel">Org_Level of the entry to use as parent.</param>
       private void AddRegion(string rawCode,string rawName,string level,string parentLevel) {
         decimal nextId = SaveList.Last().ID + 1;
         City parent = SaveList.Last(i => i.Org_Level == parentLevel);
         SaveList.Add(new City() {
           ID = nextId,
           Code = rawCode.Replace(" ","").Trim(),
           Name = rawName.Trim(),
           Org_Level = level,
           ParentCode = parent.Code,
           ParentID = parent.ID,
           Contry = "China"
         });
       }

       /// <summary>Writes every entry of <see cref="SaveList"/> to the database.</summary>
       public void Save() {
         log.Info("保存数据");
         SQLHelper.ExecuteNonQueryForCity(SaveList);
       }
     }
    
    

    全国 Org_Level =1

    省 Org_Level =2

    市 Org_Level =3

    县 Org_Level =4

    SaveList 首先添加了一个全国属性城市,Org_Level =1

    因为网页数据是按 省->市->县->省->市->县 的顺序循环读取的,所以在获取省、市、县的父级时,直接从 SaveList 中取最后一个上一级别的对象即可。

    执行类:

    // The constructor downloads and parses the page; Save() then writes the rows to the database.
    var crawler = new 省市县数据抓取();
    crawler.Save();

    获取的数据如下:

     

  • 相关阅读:
    WP7 操作XML文件
    C#和C/C++指针实现swap交换
    感受
    我学到了什么 我思考了什么.
    hdu 2768 Cat vs. Dog (最大独立)
    hdu 1960 Taxi Cab Scheme (最小覆盖)
    hdu 1528 Card Game Cheater (最小覆盖)
    hdu 4160 Dolls (最大独立)
    hdu 2458 Kindergarten (最大独立集)
    hdu 2119 Matrix (最小覆盖)
  • 原文地址:https://www.cnblogs.com/managersi/p/6941218.html
Copyright © 2011-2022 走看看