zoukankan      html  css  js  c++  java
  • 从统计局抓取2016年最新的全国区县数据!!

    using System;
    using System.Collections.Generic;
    using System.Web;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using HtmlAgilityPack;
    using System.Text;
    /// <summary>
    /// Web Forms page that crawls the administrative-division listing published on
    /// www.stats.gov.cn and writes every entry (name plus 12-digit code) to the response,
    /// one per line, indented by hierarchy depth.
    /// </summary>
    public partial class 抓取区县 : System.Web.UI.Page
    {
        /// <summary>Index page the crawl starts from.</summary>
        private const string IndexUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";

        /// <summary>Codes are right-padded with '0' to this length before being written.</summary>
        private const int CodeLength = 12;

        /// <summary>Deepest level that is read (4 uses the "towntr" rows); no links are followed from it.</summary>
        private const int MaxDepth = 4;

        protected void Page_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Starts the crawl: lists every link found on the index page and recurses into each one.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Button1_Click(object sender, EventArgs e)
        {
            HtmlDocument doc = LoadDocument(IndexUrl);
            HtmlNodeCollection links = doc.DocumentNode.SelectNodes(".//a[@href]");
            if (links == null)
            {
                return;
            }

            foreach (HtmlNode link in links)
            {
                string target = link.Attributes["href"].Value;

                // Skip the ICP filing ("miibeian") footer link.
                if (target.Contains("miibeian"))
                {
                    continue;
                }

                // The top-level code is derived from the link's file name, e.g. "11.html" -> "110000000000".
                string code = target.Replace(".html", "").PadRight(CodeLength, '0');
                WriteEntry(1, link.InnerText, code);

                string childUrl = CombineUrl(IndexUrl, target);
                if (childUrl != null)
                {
                    GetArea(childUrl, 1, code);
                }
            }
        }

        /// <summary>
        /// Recursively reads one listing page and writes its entries to the response.
        /// </summary>
        /// <param name="Url">Absolute address of the page to read.</param>
        /// <param name="dep">Depth of the parent entry; the rows on this page are at depth <c>dep + 1</c>.</param>
        /// <param name="parentCode">
        /// Code of the parent entry. No longer needed to build child addresses (links are resolved
        /// against <paramref name="Url"/> instead); kept so the signature stays unchanged.
        /// </param>
        private void GetArea(string Url, int dep, string parentCode)
        {
            dep++;

            string rowClass = classname(dep);
            if (rowClass.Length == 0)
            {
                // No row class is known for this depth, so there is nothing to select.
                return;
            }

            HtmlDocument doc = LoadDocument(Url);

            // The last cell of each row carries the name; the cell before it carries the code.
            HtmlNodeCollection nameCells = doc.DocumentNode.SelectNodes(".//tr[@class='" + rowClass + "']/td[last()]");
            if (nameCells == null)
            {
                return;
            }

            foreach (HtmlNode nameCell in nameCells)
            {
                HtmlNode link = nameCell.SelectSingleNode(".//a[@href]");
                string target = link == null ? null : link.Attributes["href"].Value;

                // Skip the ICP filing ("miibeian") link should it ever appear inside a row.
                if (target != null && target.Contains("miibeian"))
                {
                    continue;
                }

                string code = ReadCode(nameCell);
                WriteEntry(dep, nameCell.InnerText, code);

                // Rows without a link have no children; rows at MaxDepth are not followed.
                if (target == null || dep >= MaxDepth)
                {
                    continue;
                }

                string childUrl = CombineUrl(Url, target);
                if (childUrl != null)
                {
                    GetArea(childUrl, dep, code);
                }
            }
        }

        /// <summary>
        /// Downloads and parses a page, forcing GB2312 decoding.
        /// </summary>
        /// <param name="url">Absolute address of the page.</param>
        /// <returns>The parsed document.</returns>
        private static HtmlDocument LoadDocument(string url)
        {
            HtmlWeb web = new HtmlWeb();
            web.OverrideEncoding = Encoding.GetEncoding("GB2312");
            return web.Load(url);
        }

        /// <summary>
        /// Resolves a link found on a page against that page's address.
        /// </summary>
        /// <param name="baseUrl">Absolute address of the page containing the link.</param>
        /// <param name="relative">The link's href value (relative or absolute).</param>
        /// <returns>The absolute http/https address, or <c>null</c> when the link cannot be followed.</returns>
        private static string CombineUrl(string baseUrl, string relative)
        {
            Uri resolved;
            if (!Uri.TryCreate(new Uri(baseUrl), relative, out resolved))
            {
                return null;
            }

            // Ignore javascript:, mailto: and similar links that cannot be downloaded.
            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
            {
                return null;
            }

            return resolved.AbsoluteUri;
        }

        /// <summary>
        /// Reads the code from the element that precedes the name cell.
        /// </summary>
        /// <param name="nameCell">The last cell of a listing row.</param>
        /// <returns>
        /// The trimmed code right-padded with '0' to <see cref="CodeLength"/> characters;
        /// all zeros when the row has no preceding cell.
        /// </returns>
        private static string ReadCode(HtmlNode nameCell)
        {
            // Step over whitespace/comment nodes so only a real element is read.
            HtmlNode cell = nameCell.PreviousSibling;
            while (cell != null && cell.NodeType != HtmlNodeType.Element)
            {
                cell = cell.PreviousSibling;
            }

            string text = cell == null ? "" : HtmlEntity.DeEntitize(cell.InnerText).Trim();
            return text.PadRight(CodeLength, '0');
        }

        /// <summary>
        /// Writes one entry line to the response as "prefix + name(code)&lt;br/&gt;".
        /// </summary>
        /// <param name="dep">Depth of the entry; controls the indentation prefix.</param>
        /// <param name="name">Entry name as returned by <c>InnerText</c> (may still contain HTML entities).</param>
        /// <param name="code">Padded code of the entry.</param>
        private void WriteEntry(int dep, string name, string code)
        {
            // The text comes from a remote page: decode its entities, then encode for safe output.
            string safeName = HttpUtility.HtmlEncode(HtmlEntity.DeEntitize(name));
            Response.Write(joinstr(dep) + safeName + "(" + HttpUtility.HtmlEncode(code) + ")<br/>");
        }

        /// <summary>
        /// Builds the indentation prefix for a hierarchy level.
        /// </summary>
        /// <param name="dep">Depth of the entry (1 = top level).</param>
        /// <returns>"-----|" repeated <c>dep - 1</c> times; empty for depth 1 or less.</returns>
        private string joinstr(int dep)
        {
            StringBuilder prefix = new StringBuilder();
            for (int level = 1; level < dep; level++)
            {
                prefix.Append("-----|");
            }

            return prefix.ToString();
        }

        /// <summary>
        /// Maps a depth to the class attribute of the table rows selected at that level.
        /// </summary>
        /// <param name="dep">Depth of the rows being read.</param>
        /// <returns>"citytr", "countytr" or "towntr" for depths 2 to 4; an empty string otherwise.</returns>
        private string classname(int dep)
        {
            switch (dep)
            {
                case 2:
                    return "citytr";
                case 3:
                    return "countytr";
                case 4:
                    return "towntr";
                default:
                    return "";
            }
        }
    }

    以上是我写的完整代码,使用开源的 HtmlAgilityPack 组件解析 HTML,该组件直接下载即可使用。关键数据都已经获取到,如:区划代码和名称。

    你可以直接把这些数据写入自己的数据库。

    效果如下:

  • 相关阅读:
    5个人的晚餐
    replace(),indexOf(),substring(),split(),join(),——各种小知识点
    2016-11-01——孤独留给自己,开心让给别人
    CMS3.0——初次邂逅express
    项目三(集团官网)——总结(2) 递归创建子目录
    项目三(集团官网)——总结(1) cookie
    jQuery_pager.js分页
    thinkjs——修改where默认条件为or
    优化之——查询数据库
    go net库
  • 原文地址:https://www.cnblogs.com/efreer/p/6230624.html
Copyright © 2011-2022 走看看