zoukankan      html  css  js  c++  java
  • 给老子爬爬爬!2019国家统计局最新城乡划分代码

    爬一下最新的行政区划

    http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html

    引入依赖(除了下面这几个,代码里还用到了 fastjson 和 Lombok 的 @Slf4j,项目中如果没有 fastjson 和 slf4j,也需要一并引入)

    <!-- https://mvnrepository.com/artifact/com.belerweb/pinyin4j -->
    <dependency>
        <groupId>com.belerweb</groupId>
        <artifactId>pinyin4j</artifactId>
        <version>2.5.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.projectlombok/lombok -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.8</version>
        <scope>provided</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.13.1</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp -->
    <dependency>
        <groupId>com.squareup.okhttp3</groupId>
        <artifactId>okhttp</artifactId>
        <version>4.4.1</version>
    </dependency>

    代码

    package com.demo.tools;
    
    import com.alibaba.fastjson.JSON;
    import lombok.Getter;
    import lombok.Setter;
    import lombok.extern.slf4j.Slf4j;
    import net.sourceforge.pinyin4j.PinyinHelper;
    import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
    import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
    import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
    import okhttp3.ConnectionPool;
    import okhttp3.OkHttpClient;
    import okhttp3.Request;
    import okhttp3.Response;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    
    import java.io.FileNotFoundException;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.util.*;
    import java.util.concurrent.TimeUnit;
    import java.util.regex.Pattern;
    
    /**
     * Crawls the 2019 province / city / county division codes from the pages under
     * {@link #BASE_URL} and writes the resulting tree to a JSON file.
     *
     * Created by 小LUA on 2020-03-30 11:39.
     */
    @Slf4j
    public class GetProvince {

        /** Root of the 2019 pages; the hrefs followed by this crawler are appended to it. */
        private static final String BASE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";

        /**
         * Charset used to decode the downloaded pages. GBK is a superset of GB2312; decoding
         * as "gb2312" garbles names containing rarer characters (for example 埇, 瀍, 浉, 硚,
         * 猇, 鄠), which in turn leaves their letterSort empty.
         */
        private static final String PAGE_CHARSET = "GBK";

        /** Matches link texts that consist only of digits (the code column of a row). */
        private static final Pattern DIGITS = Pattern.compile("[0-9]+");

        /** Pause between two attempts at a page that came back without the expected rows. */
        private static final long RETRY_DELAY_MILLIS = 1000L;

        /**
         * City pages whose rows carry the class "towntr" instead of "countytr".
         * NOTE(review): presumably cities without a county level - confirm against the site.
         */
        private static final Set<String> TOWN_ROW_PAGES = new HashSet<>(Arrays.asList(
                BASE_URL + "44/4419.html",
                BASE_URL + "44/4420.html",
                BASE_URL + "46/4604.html"));

        /** Shared HTTP client: 5-minute timeouts and a pool that keeps no idle connections. */
        private static final OkHttpClient client = new OkHttpClient.Builder()
                .connectTimeout(5, TimeUnit.MINUTES)
                .writeTimeout(5, TimeUnit.MINUTES)
                .readTimeout(5, TimeUnit.MINUTES)
                .connectionPool(new ConnectionPool(0, 30, TimeUnit.MINUTES))
                .build();

        /**
         * Returns the upper-case first pinyin letter of the first character of a name.
         *
         * @param chinese the name to inspect; may be null or empty
         * @return a one-letter string, or "" when no pinyin is available for the first character
         */
        private static String getFirstSpell(String chinese) {
            if (chinese == null || chinese.isEmpty()) {
                return "";
            }
            HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
            format.setCaseType(HanyuPinyinCaseType.UPPERCASE);
            String[] readings;
            try {
                readings = PinyinHelper.toHanyuPinyinStringArray(chinese.charAt(0), format);
            } catch (BadHanyuPinyinOutputFormatCombination e) {
                System.err.println("拼音转换失败:" + chinese + "," + e.getMessage());
                return "";
            }
            // No reading at all (e.g. the character is not Chinese): fall back to "".
            if (readings == null || readings.length == 0
                    || readings[0] == null || readings[0].isEmpty()) {
                return "";
            }
            return String.valueOf(readings[0].charAt(0));
        }

        /**
         * Downloads a page and decodes it with {@link #PAGE_CHARSET}.
         *
         * @param url absolute URL of the page
         * @return the decoded response body (the HTTP status is not inspected)
         * @throws IOException if the request fails or the body cannot be read
         */
        public static String readUrl(String url) throws IOException {
            System.out.println("读取URL:" + url);
            Request request = new Request.Builder()
                    .url(url)
                    .build();
            try (Response response = client.newCall(request).execute()) {
                return new String(response.body().bytes(), PAGE_CHARSET);
            }
        }

        /**
         * Downloads a page until it contains at least one element of the given class.
         * The number of attempts is unbounded; attempts are spaced by {@link #RETRY_DELAY_MILLIS}.
         *
         * @param url      absolute URL of the page
         * @param rowClass CSS class of the rows to extract
         * @param label    prefix of the progress line printed after every attempt
         * @return the non-empty set of matching rows
         */
        private static Elements fetchRows(String url, String rowClass, String label)
                throws IOException, InterruptedException {
            while (true) {
                Elements rows = Jsoup.parse(readUrl(url)).getElementsByClass(rowClass);
                System.out.println(label + "是否为空:" + rows.isEmpty());
                if (!rows.isEmpty()) {
                    return rows;
                }
                Thread.sleep(RETRY_DELAY_MILLIS);
            }
        }

        /** Creates a node with code, name, level, first letter and parent code filled in. */
        private static Location newLocation(Long code, String name, int level, Long parentCode) {
            Location location = new Location();
            location.setCode(code);
            location.setName(name);
            location.setLevel(level);
            location.setLetterSort(getFirstSpell(name));
            location.setParentCode(parentCode);
            return location;
        }

        /** Builds the level-1 node for one link of the index page, including its cities. */
        private static Location crawlProvince(Element link) throws IOException, InterruptedException {
            String name = link.text();
            String href = link.attr("href");
            String shortCode = href.substring(0, href.indexOf("."));
            String cityUrl = BASE_URL + href;
            System.out.println(name + "," + shortCode + "," + cityUrl);
            Long provinceCode = Long.valueOf(shortCode + "0000");

            Location province = newLocation(provinceCode, name, 1, null);
            List<Location> cities = new ArrayList<>();
            province.setChilds(cities);

            for (Element cityRow : fetchRows(cityUrl, "citytr", "cityList")) {
                for (Element cityLink : cityRow.select("a")) {
                    // Each row links both the numeric code and the name; keep only the name link.
                    if (DIGITS.matcher(cityLink.text()).matches()) {
                        continue;
                    }
                    cities.add(crawlCity(cityLink, provinceCode));
                }
            }
            return province;
        }

        /** Builds the level-2 node for one link of a province page, including its counties. */
        private static Location crawlCity(Element link, Long provinceCode)
                throws IOException, InterruptedException {
            String name = link.text();
            String href = link.attr("href");
            String shortCode = href.substring(href.indexOf("/") + 1, href.indexOf("."));
            String countyUrl = BASE_URL + href;
            System.out.println(name + "," + shortCode + "," + countyUrl);
            Long cityCode = Long.valueOf(shortCode + "00");

            Location city = newLocation(cityCode, name, 2, provinceCode);
            List<Location> counties = new ArrayList<>();
            city.setChilds(counties);

            String rowClass = TOWN_ROW_PAGES.contains(countyUrl) ? "towntr" : "countytr";
            for (Element countyRow : fetchRows(countyUrl, rowClass, "countyList")) {
                for (Element countyLink : countyRow.select("a")) {
                    String countyName = countyLink.text();
                    // Skip the link whose text is the numeric code; keep only the name link.
                    if (DIGITS.matcher(countyName).matches()) {
                        continue;
                    }
                    String countyHref = countyLink.attr("href");
                    String countyCode = countyHref.substring(
                            countyHref.indexOf("/") + 1, countyHref.indexOf("."));
                    System.out.println(countyName + "," + countyCode);
                    counties.add(newLocation(Long.valueOf(countyCode), countyName, 3, cityCode));
                }
            }
            return city;
        }

        /**
         * Crawls index page, province pages and city pages, prints the resulting tree as JSON
         * and stores it through {@link #write(String)}.
         *
         * @param args unused
         * @throws Exception if a download fails, the file cannot be written, or the thread is
         *                   interrupted while waiting to retry
         */
        public static void main(String[] args) throws Exception {
            List<Location> all = new ArrayList<>();
            Elements provinceRows = fetchRows(BASE_URL + "index.html", "provincetr", "provinceList");
            for (Element provinceRow : provinceRows) {
                for (Element provinceLink : provinceRow.select("a")) {
                    all.add(crawlProvince(provinceLink));
                }
            }
            String jsonString = JSON.toJSONString(all);
            System.out.println(jsonString);
            write(jsonString);
        }

        /**
         * Writes the text to "2019省市区-大陆.json" in the working directory, replacing any
         * existing file. The text is encoded as UTF-8 explicitly so the output does not depend
         * on the platform default charset.
         *
         * @param str the text to store
         * @throws IOException if the file cannot be created or written
         */
        public static void write(String str) throws IOException {
            try (FileOutputStream out = new FileOutputStream("2019省市区-大陆.json")) {
                out.write(str.getBytes(StandardCharsets.UTF_8));
            }
        }

    }
    
    /**
     * One node of the division tree built by the crawler; getters and setters are
     * generated by Lombok.
     */
    @Setter
    @Getter
    class Location {

        /** Numeric division code. */
        private Long code;

        /** Display name taken from the link text. */
        private String name;

        /** Depth in the tree: the crawler uses 1 for provinces, 2 for cities, 3 for counties. */
        private Integer level;

        /** Upper-case first pinyin letter of the name, or "" when it could not be derived. */
        private String letterSort;

        /** Code of the parent node; the crawler leaves it unset for provinces. */
        private Long parentCode;

        /** Direct children; the crawler leaves it unset for level-3 nodes. */
        private List<Location> childs;
    }

    爬完的数据我只存到了 json 文件里。如果你需要存到数据库,只需要对 all 进行处理即可,或者先读文件再处理:

    /**
     * Loads "2019省市区-大陆.json" from the working directory and parses it into the list of
     * top-level nodes, then visits each of them (the per-node handling is left as a TODO).
     * The file is decoded as UTF-8 rather than with the platform default charset, so it
     * must have been stored as UTF-8.
     *
     * @throws IOException if the file cannot be opened or read
     */
    private static void read() throws IOException {
        StringBuilder json = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even on failure.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream("2019省市区-大陆.json"), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                json.append(line);
            }
        }
        List<Location> provinces = JSONArray.parseArray(json.toString(), Location.class);
        provinces.forEach(e -> {
            // TODO
        });
    }

    另附:全部数据。页面按 gb2312 解码时,有很小一部分含生僻字的名称爬下来会是乱码(这些字不在 GB2312 字符集内,改用 GBK 解码可以避免),导致首字母识别不出来,需要手动改正;搜索【"letterSort": ""】即可找到。

    一共有6个,不算多。

    数据文件:https://github.com/Mysakura/DataFiles

    ============================================

    算了,我帮你们找出来了

    {
        "code": 341302,
        "letterSort": "Y",
        "level": 3,
        "name": "埇桥区",
        "parentCode": 341300
    }
    
    {
        "code": 410304,
        "letterSort": "C",
        "level": 3,
        "name": "瀍河回族区",
        "parentCode": 410300
    }
    
    {
        "code": 411502,
        "letterSort": "S",
        "level": 3,
        "name": "浉河区",
        "parentCode": 411500
    }
    
    {
        "code": 420104,
        "letterSort": "Q",
        "level": 3,
        "name": "硚口区",
        "parentCode": 420100
    }
    
    {
        "code": 420505,
        "letterSort": "X",
        "level": 3,
        "name": "猇亭区",
        "parentCode": 420500
    }
    
    {
        "code": 610118,
        "letterSort": "H",
        "level": 3,
        "name": "鄠邑区",
        "parentCode": 610100
    }
  • 相关阅读:
    Android 解决小米手机Android Studio安装app 报错的问题It is possible that this issue is resolved by uninstalling an existi
    Android Unresolved Dependencies
    Android studio 自定义打包apk名
    Android Fragment与Activity交互的几种方式
    魅族和三星Galaxy 5.0webView 问题Android Crash Report
    Android几种常见的多渠道(批量)打包方式介绍
    Android批量打包 如何一秒内打完几百个apk渠道包
    上周热点回顾(9.30-10.6)团队
    上周热点回顾(9.23-9.29)团队
    上周热点回顾(9.16-9.22)团队
  • 原文地址:https://www.cnblogs.com/LUA123/p/12603550.html
Copyright © 2011-2022 走看看