zoukankan      html  css  js  c++  java
  • 给老子爬爬爬!2019国家统计局最新城乡划分代码

    爬一下最新的行政区划

    http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html

    引入依赖(代码中还用到了 fastjson 和 slf4j(@Slf4j 注解需要),下面没有列出,需要自行添加)

    <!-- https://mvnrepository.com/artifact/com.belerweb/pinyin4j -->
    <dependency>
        <groupId>com.belerweb</groupId>
        <artifactId>pinyin4j</artifactId>
        <version>2.5.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.8</version>
        <scope>provided</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.13.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
    <dependency>
        <groupId>com.squareup.okhttp3</groupId>
        <artifactId>okhttp</artifactId>
        <version>4.4.1</version>
    </dependency>

    代码

    package com.demo.tools;
    
    import com.alibaba.fastjson.JSON;
    import lombok.Getter;
    import lombok.Setter;
    import lombok.extern.slf4j.Slf4j;
    import net.sourceforge.pinyin4j.PinyinHelper;
    import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
    import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
    import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
    import okhttp3.ConnectionPool;
    import okhttp3.OkHttpClient;
    import okhttp3.Request;
    import okhttp3.Response;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.util.*;
    import java.util.concurrent.TimeUnit;
    import java.util.regex.Pattern;
    
    /**
     * Crawls the 2019 urban/rural division code pages of the National Bureau of Statistics
     * and writes the result to a JSON file as a province → city → county tree.
     *
     * <p>Created by 小LUA on 2020-03-30 11:39.
     */
    @Slf4j
    public class GetProvince {

        /** Root of the 2019 listing; hrefs found on the pages are appended to it. */
        private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";

        /**
         * Charset used to decode the pages. GBK is a superset of GB2312; decoding as plain
         * "gb2312" garbled names containing rare characters (e.g. 埇桥区, 瀍河回族区, 浉河区),
         * which in turn left their letterSort empty.
         */
        private static final String PAGE_CHARSET = "GBK";

        /** Matches link texts made of digits only (the code column); such links are skipped. */
        private static final Pattern DIGITS_ONLY = Pattern.compile("[0-9]+");

        /** City pages whose rows are read from the "towntr" class instead of "countytr". */
        private static final Set<String> TOWN_ROW_CITY_URLS = new HashSet<>(Arrays.asList(
                BASE_URL + "44/4419.html",
                BASE_URL + "44/4420.html",
                BASE_URL + "46/4604.html"));

        /** Pause between two attempts at the same page, so retries do not hammer the server. */
        private static final long RETRY_DELAY_MILLIS = 1000L;

        private static final OkHttpClient client = new OkHttpClient.Builder()
                .connectTimeout(5, TimeUnit.MINUTES)
                .writeTimeout(5, TimeUnit.MINUTES)
                .readTimeout(5, TimeUnit.MINUTES)
                .connectionPool(new ConnectionPool(0, 30, TimeUnit.MINUTES))
                .build();

        /**
         * Returns the upper-case first pinyin letter of the first character of a name.
         *
         * @param chinese the name to inspect; may be null or empty
         * @return a one-letter string, or "" when the input is null/empty, the first character
         *         has no pinyin reading, or the conversion fails (this method never throws)
         */
        private static String getFirstSpell(String chinese) {
            if (chinese == null || chinese.isEmpty()) {
                return "";
            }
            HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
            format.setCaseType(HanyuPinyinCaseType.UPPERCASE);
            try {
                String[] readings = PinyinHelper.toHanyuPinyinStringArray(chinese.charAt(0), format);
                if (readings == null || readings.length == 0
                        || readings[0] == null || readings[0].isEmpty()) {
                    return "";
                }
                return String.valueOf(readings[0].charAt(0));
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                log.warn("Invalid pinyin output format while converting '{}'", chinese, e);
                return "";
            } catch (RuntimeException e) {
                // Best effort: a failed lookup must not abort a long-running crawl.
                log.warn("Could not derive the first pinyin letter of '{}'", chinese, e);
                return "";
            }
        }

        /**
         * Downloads a page and decodes it with {@link #PAGE_CHARSET}.
         *
         * @param url absolute URL of the page
         * @return the decoded response body
         * @throws IOException if the request fails or the body cannot be read
         */
        public static String readUrl(String url) throws IOException {
            System.out.println("读取URL:" + url);
            Request request = new Request.Builder()
                    .url(url)
                    .build();
            // The status code is deliberately not checked: a page without the expected rows
            // is simply requested again by fetchRows.
            try (Response response = client.newCall(request).execute()) {
                return new String(response.body().bytes(), PAGE_CHARSET);
            }
        }

        /**
         * Entry point: crawls every province listed on the index page and stores the whole
         * tree as JSON.
         *
         * @param args unused
         * @throws Exception if a request fails, a page has an unexpected link format,
         *                   or the output file cannot be written
         */
        public static void main(String[] args) throws Exception {
            List<Location> all = new ArrayList<>();
            for (Element provinceRow : fetchRows(BASE_URL + "index.html", "provincetr", "provinceList")) {
                for (Element provinceLink : provinceRow.select("a")) {
                    all.add(crawlProvince(provinceLink));
                }
            }
            String jsonString = JSON.toJSONString(all);
            System.out.println(jsonString);
            write(jsonString);
        }

        /**
         * Fetches a page until it contains at least one element of the given CSS class.
         * The number of attempts is unbounded; attempts are spaced by {@link #RETRY_DELAY_MILLIS}.
         *
         * @param url      page to download
         * @param rowClass CSS class of the table rows to extract
         * @param label    name printed in the progress line
         * @return the non-empty set of matching rows
         */
        private static Elements fetchRows(String url, String rowClass, String label)
                throws IOException, InterruptedException {
            while (true) {
                Elements rows = Jsoup.parse(readUrl(url)).getElementsByClass(rowClass);
                System.out.println(label + "是否为空:" + rows.isEmpty());
                if (!rows.isEmpty()) {
                    return rows;
                }
                Thread.sleep(RETRY_DELAY_MILLIS);
            }
        }

        /** Builds the level-1 node for one province link, including all of its cities. */
        private static Location crawlProvince(Element provinceLink)
                throws IOException, InterruptedException {
            String pName = provinceLink.text();
            String pHref = provinceLink.attr("href");
            String pCode = pHref.substring(0, pHref.indexOf("."));
            String cityUrl = BASE_URL + pHref;
            System.out.println(pName + "," + pCode + "," + cityUrl);
            // The href only carries the leading digits; pad them to a six-digit code.
            Long provinceCode = Long.valueOf(pCode + "0000");

            Location province = newLocation(provinceCode, pName, 1);
            List<Location> cities = new ArrayList<>();
            province.setChilds(cities);

            for (Element cityRow : fetchRows(cityUrl, "citytr", "cityList")) {
                for (Element cityLink : cityRow.select("a")) {
                    // Each row links both the numeric code and the name; keep the name link only.
                    if (DIGITS_ONLY.matcher(cityLink.text()).matches()) {
                        continue;
                    }
                    cities.add(crawlCity(cityLink, provinceCode));
                }
            }
            return province;
        }

        /** Builds the level-2 node for one city link, including all of its counties. */
        private static Location crawlCity(Element cityLink, Long provinceCode)
                throws IOException, InterruptedException {
            String cName = cityLink.text();
            String cHref = cityLink.attr("href");
            String cCode = cHref.substring(cHref.indexOf("/") + 1, cHref.indexOf("."));
            String countyUrl = BASE_URL + cHref;
            System.out.println(cName + "," + cCode + "," + countyUrl);
            Long cityCode = Long.valueOf(cCode + "00");

            Location city = newLocation(cityCode, cName, 2);
            List<Location> counties = new ArrayList<>();
            city.setChilds(counties);
            city.setParentCode(provinceCode);

            String rowClass = TOWN_ROW_CITY_URLS.contains(countyUrl) ? "towntr" : "countytr";
            for (Element countyRow : fetchRows(countyUrl, rowClass, "countyList")) {
                // Rows without links yield nothing here, because the code is taken from the href.
                for (Element countyLink : countyRow.select("a")) {
                    String aName = countyLink.text();
                    if (DIGITS_ONLY.matcher(aName).matches()) {
                        continue;
                    }
                    String aHref = countyLink.attr("href");
                    String aCode = aHref.substring(aHref.indexOf("/") + 1, aHref.indexOf("."));
                    System.out.println(aName + "," + aCode);

                    Location county = newLocation(Long.valueOf(aCode), aName, 3);
                    county.setParentCode(cityCode);
                    counties.add(county);
                }
            }
            return city;
        }

        /** Creates a node with code, name, level and the first pinyin letter of the name. */
        private static Location newLocation(Long code, String name, int level) {
            Location location = new Location();
            location.setCode(code);
            location.setName(name);
            location.setLevel(level);
            location.setLetterSort(getFirstSpell(name));
            return location;
        }

        /**
         * Writes the given text to "2019省市区-大陆.json" in the working directory,
         * replacing any existing file.
         *
         * @param str content to store
         * @throws IOException if the file cannot be opened or written
         */
        public static void write(String str) throws IOException {
            // NOTE(review): the text is encoded with the platform default charset, which is
            // also what the read() helper of this article decodes with; switch both together.
            try (FileOutputStream out = new FileOutputStream("2019省市区-大陆.json")) {
                out.write(str.getBytes());
            }
        }

    }
    
    /**
     * One node of the crawled division tree (province, city or county).
     * Accessors are generated by Lombok; instances are serialized to JSON by the crawler.
     */
    @Getter
    @Setter
    class Location{
        /** Numeric division code: province prefix + "0000", city prefix + "00", or the code taken from the county link. */
        private Long code;
        /** Display name as shown on the source page. */
        private String name;
        /** Depth in the tree: 1 = province, 2 = city, 3 = county. */
        private Integer level;
        /** Upper-case first pinyin letter of the name, or "" when it could not be determined. */
        private String letterSort;
        /** Code of the parent node; left unset for provinces. */
        private Long parentCode;
        /** Direct sub-divisions; left unset for counties. */
        private List<Location> childs;
    }

    爬完数据我只是存到了 json 文件里,如果你需要存到数据库,只需要对 all 进行处理即可,或者像下面这样读文件再处理:

    /**
     * Loads the crawled tree back from "2019省市区-大陆.json" and visits every top-level entry.
     *
     * @throws IOException if the file is missing or cannot be read
     */
    private static void read() throws IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes both the reader and the underlying stream, even on failure.
        // NOTE(review): decoding uses the platform default charset, matching how write()
        // encodes the file; switch both to an explicit charset together.
        try (FileInputStream in = new FileInputStream("2019省市区-大陆.json");
             BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        }
        List<Location> cities = JSONArray.parseArray(sb.toString(), Location.class);
        cities.forEach(e -> {
            // TODO
        });
    }

    另附:全部数据(很小一部分的名称爬下来就是乱码,导致首字母识别不出来需要手动改正,搜索【"letterSort": ""】。这些名称都含有 GB2312 字符集之外的生僻字,页面内容如果按 gb2312 解码就会变成乱码,按 GBK 解码则可以正常读出)

    一共有6个,不算多。

    数据文件:https://github.com/Mysakura/DataFiles

    ============================================

    算了,我帮你们找出来了

    {
        "code": 341302,
        "letterSort": "Y",
        "level": 3,
        "name": "埇桥区",
        "parentCode": 341300
    }
    
    {
        "code": 410304,
        "letterSort": "C",
        "level": 3,
        "name": "瀍河回族区",
        "parentCode": 410300
    }
    
    {
        "code": 411502,
        "letterSort": "S",
        "level": 3,
        "name": "浉河区",
        "parentCode": 411500
    }
    
    {
        "code": 420104,
        "letterSort": "Q",
        "level": 3,
        "name": "硚口区",
        "parentCode": 420100
    }
    
    {
        "code": 420505,
        "letterSort": "X",
        "level": 3,
        "name": "猇亭区",
        "parentCode": 420500
    }
    
    {
        "code": 610118,
        "letterSort": "H",
        "level": 3,
        "name": "鄠邑区",
        "parentCode": 610100
    }
  • 相关阅读:
    POJ3977 Subset 折半枚举
    Ubuntu和Win7双系统,ubuntu被删,重新启动之后显示,no such partition
    hdu 4296 贪心
    Python标准库:内置函数tuple([iterable])
    【python自制】让大白成为你的个人助手!
    Flex设置LinkButton的背景色
    VB6基本数据库应用(五):数据的查找与筛选
    正态分布(normal distribution)与偏态分布(skewed distribution)
    windows 系统文件 —— 特殊文件及文件类型
    windows 系统文件 —— 特殊文件及文件类型
  • 原文地址:https://www.cnblogs.com/LUA123/p/12603550.html
Copyright © 2011-2022 走看看