zoukankan      html  css  js  c++  java
  • C#爬取国家统计局五级地址

    // Source site: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
    // This crawler starts from a single province page and works downward; to crawl
    // every province in one run it would have to start one level higher.
    // URL of the starting (province) page; assigned in ProcessRequest from the posted "url" form field.
    public string url;
    // Name of the table the crawled rows are stored in.
    public string dbname;
    // Province-level code.
    public string code;
    // Province name.
    public string name;
    // Name of the database in which the storage table is created.
    public static string database = "TEST";
    // Failure counter used by returnHtml to cope with connection timeouts and other unexpected disconnects.
    public int flag = 0;
    /// <summary>
    /// Handler entry point: reads the start URL from the posted form, makes sure the
    /// storage table exists, then crawls each level in order and reports completion.
    /// </summary>
    /// <param name="context">Current HTTP context; the completion message is written to its response.</param>
    public void ProcessRequest(HttpContext context)
    {
        // The posted "url" form field is HTML-decoded before it is stored.
        string postedUrl = System.Web.HttpContext.Current.Request.Form["url"];
        url = System.Web.HttpUtility.HtmlDecode(postedUrl);

        TableExist(dbname);

        // Each level reads the rows written by the previous one, so the order matters.
        Provincial();
        City();
        County();
        Town();
        Village();

        context.Response.Write("爬取成功");
    }
    /// <summary>
    /// Creates the storage table when information_schema.TABLES lists no table with the given name.
    /// </summary>
    /// <param name="dbname">Bare name of the table to check for and, if missing, create under [dbo].</param>
    /// <exception cref="ArgumentException">Thrown when <paramref name="dbname"/> is null or empty.</exception>
    public void TableExist(string dbname) {
        if (string.IsNullOrEmpty(dbname))
        {
            // An empty name would otherwise produce "CREATE TABLE [dbo].[]" and fail inside SQL.
            throw new ArgumentException("Table name must not be empty.", "dbname");
        }

        // The name is used both inside a quoted literal and inside bracketed identifiers;
        // escape it for each context so it cannot terminate the literal/identifier early.
        string nameAsLiteral = dbname.Replace("'", "''");
        string nameAsIdentifier = dbname.Replace("]", "]]");
        string databaseAsIdentifier = (database ?? string.Empty).Replace("]", "]]");

        DataTable dt = bll.SelectbySql("SELECT table_name FROM information_schema.TABLES WHERE table_name ='" + nameAsLiteral + "'");
        if (dt.Rows.Count > 0)
        {
            return;
        }

        // Line breaks are written as escapes: a regular string literal cannot span source lines.
        string sql =
            "USE [" + databaseAsIdentifier + "]\r\n" +
            "SET ANSI_NULLS ON\r\n" +
            "SET QUOTED_IDENTIFIER ON\r\n" +
            "CREATE TABLE[dbo].[" + nameAsIdentifier + "](" +
                "[ID][int] IDENTITY(1, 1) NOT NULL," +
                "[Code] [nvarchar] (20) NULL," +
                "[ParentCode] [nvarchar] (20) NULL," +
                "[Name] [nvarchar] (50) NULL," +
                "[Path] [nvarchar] (100) NULL," +
                "[PathName] [nvarchar] (200) NULL," +
                "[Levels] [int] NULL," +
                "[Urls] [nvarchar] (max) NULL," +
                "[DeleteMark] [bit] NULL," +
                "CONSTRAINT[PK_" + nameAsIdentifier + "] PRIMARY KEY CLUSTERED" +
            "(" +
                "[ID] ASC" +
            ")WITH(PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON) ON[PRIMARY]" +
            ") ON[PRIMARY] TEXTIMAGE_ON[PRIMARY]\r\n";
        bll.RunbySql(sql);
    }
    /// <summary>
    /// Inserts the root row (level 0) built from the <c>code</c>, <c>name</c> and <c>url</c> fields.
    /// </summary>
    public void Provincial()
    {
        InsertRegionRow(code, "0", name, code, name, 0, url);
    }

    /// <summary>
    /// Crawls the pages stored on level-0 rows and inserts every "citytr" table row as level 1.
    /// </summary>
    public void City()
    {
        CrawlLevel(0, "citytr", 1, true);
    }

    /// <summary>
    /// Crawls the pages stored on level-1 rows and inserts every "countytr" table row as level 2.
    /// </summary>
    public void County()
    {
        CrawlLevel(1, "countytr", 1, true);
    }

    /// <summary>
    /// Crawls the pages stored on level-2 rows and inserts every "towntr" table row as level 3.
    /// </summary>
    public void Town()
    {
        CrawlLevel(2, "towntr", 1, true);
    }

    /// <summary>
    /// Crawls the pages stored on level-3 rows and inserts every "villagetr" table row as level 4.
    /// The name is read from the third cell and no child URL is stored for these rows.
    /// </summary>
    public void Village()
    {
        CrawlLevel(3, "villagetr", 2, false);
    }

    /// <summary>
    /// Shared crawler: for each stored row of <paramref name="parentLevel"/> that has a URL,
    /// downloads the page, picks the table rows carrying <paramref name="rowClass"/> and inserts
    /// them as children one level deeper.
    /// </summary>
    /// <param name="parentLevel">Value of the Levels column identifying the parent rows.</param>
    /// <param name="rowClass">CSS class of the table rows to read from each page.</param>
    /// <param name="nameCellIndex">Index of the cell holding the name (the code is always cell 0).</param>
    /// <param name="followLinks">When true, descends into the cell's first child and records its href as the child URL.</param>
    private void CrawlLevel(int parentLevel, string rowClass, int nameCellIndex, bool followLinks)
    {
        DataTable parents = bll.SelectbySql("select * from " + dbname + " where DeleteMark=0 and Levels=" + parentLevel.ToString(System.Globalization.CultureInfo.InvariantCulture) + " and Urls is not null and Urls<>''");
        if (parents == null)
        {
            return;
        }

        foreach (DataRow parent in parents.Rows)
        {
            string pageUrl = parent["Urls"].ToString();
            string html = returnHtml(pageUrl);
            if (string.IsNullOrEmpty(html))
            {
                // returnHtml yields an empty string when the download failed; nothing to parse.
                continue;
            }

            NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(html);
            Elements tableRows = doc.GetElementsByClass(rowClass);
            foreach (Element tableRow in tableRows)
            {
                if (tableRow.Children.Count <= nameCellIndex)
                {
                    // Malformed row without the expected cells; skip instead of throwing.
                    continue;
                }

                Element codeNode = tableRow.Children[0];
                Element nameNode = tableRow.Children[nameCellIndex];
                string childUrl = "";

                if (followLinks)
                {
                    // When the cell wraps its text in a child element, use that element
                    // so that its href (if any) can be read.
                    if (codeNode.Children.Count > 0)
                    {
                        codeNode = codeNode.Children[0];
                        if (nameNode.Children.Count > 0)
                        {
                            nameNode = nameNode.Children[0];
                        }
                    }

                    if (codeNode.HasAttr("href"))
                    {
                        // href is relative to the directory of the page it was found on.
                        childUrl = pageUrl.Substring(0, pageUrl.LastIndexOf('/') + 1) + codeNode.Attr("href");
                    }
                }

                string regionCode = codeNode.Text();
                string regionName = nameNode.Text();
                InsertRegionRow(
                    regionCode,
                    parent["Code"].ToString(),
                    regionName,
                    parent["Path"].ToString() + "/" + regionCode,
                    parent["PathName"].ToString() + "/" + regionName,
                    parentLevel + 1,
                    childUrl);
            }
        }
    }

    /// <summary>
    /// Inserts one row into the storage table with DeleteMark = 0. Column order follows the
    /// positional insert: Code, ParentCode, Name, Path, PathName, Levels, Urls, DeleteMark.
    /// </summary>
    private void InsertRegionRow(string regionCode, string parentCode, string regionName, string path, string pathName, int level, string pageUrl)
    {
        bll.RunbySql("insert into " + dbname + " values("
            + ToSqlLiteral(regionCode) + ","
            + ToSqlLiteral(parentCode) + ","
            + ToSqlLiteral(regionName) + ","
            + ToSqlLiteral(path) + ","
            + ToSqlLiteral(pathName) + ","
            + level.ToString(System.Globalization.CultureInfo.InvariantCulture) + ","
            + ToSqlLiteral(pageUrl) + ",0)");
    }

    /// <summary>
    /// Renders a value as a Unicode T-SQL string literal: embedded single quotes are doubled so
    /// scraped or posted text cannot terminate the literal, and the N prefix keeps non-ASCII
    /// text intact in nvarchar columns regardless of the database collation. Null becomes N''.
    /// </summary>
    private static string ToSqlLiteral(string value)
    {
        return "N'" + (value ?? string.Empty).Replace("'", "''") + "'";
    }
    /// <summary>
    /// Downloads a page and decodes it as gb2312, retrying on failure.
    /// </summary>
    /// <param name="Urls">Address of the page to download.</param>
    /// <returns>The decoded page text, or an empty string when the address is empty or every attempt failed.</returns>
    /// <remarks>
    /// <c>flag</c> counts the consecutive failures of the current call; it is reset at the start of
    /// every call and on success, so one unreachable page no longer removes the retries of the next one.
    /// </remarks>
    public string returnHtml(string Urls) {
        String HtmlString = "";
        if (string.IsNullOrEmpty(Urls))
        {
            // Nothing to download; retrying cannot help.
            return HtmlString;
        }

        flag = 0;
        while (true)
        {
            try
            {
                // WebClient is disposable; release it after every attempt.
                using (WebClient webClient = new WebClient())
                {
                    HtmlString = Encoding.GetEncoding("gb2312").GetString(webClient.DownloadData(Urls));
                }
                flag = 0;
                return HtmlString;
            }
            catch (Exception)
            {
                // Best-effort by design: any failure is retried, up to 10 retries after the first attempt.
                flag++;
                if (flag > 10)
                {
                    return HtmlString;
                }
            }
        }
    }
  • 相关阅读:
    XML炸弹
    IP分片攻击——就是发送部分分片报文,让对方一直等待从而耗对方内存的DoS攻击
    灰色软件——广告软件,拨号软件,远程访问软件等
    rootkit——一种特殊的恶意软件,它的功能是在安装目标上隐藏自身及指定的文件、进程和网络链接等信息,一般都和木马、后门等其他恶意程序结合使用
    漏洞利用 Exploit---利用默认口令、IP假冒、应用漏洞
    SequenceFile文件
    随机森林和GBDT的几个核心问题
    机器学习中的算法(1)-决策树模型组合之随机森林与GBDT
    直方图中最大矩形面积
    openMP多线程编程
  • 原文地址:https://www.cnblogs.com/tenfly/p/14435772.html
Copyright © 2011-2022 走看看