zoukankan      html  css  js  c++  java
  • 抓取国家地区基础数据

    一、介绍

    1.1 背景

    项目中地区基础数据从上线后再也没有更新过,与目前的国家地区数据不一致,需要做同步操作。

    1.2 解决方案

    通过采用爬虫手段从 国家统计局官网 抓取数据做同步。

    采用 WebMagic 框架实现操作

    二、操作

    2.1 创建本地临时表

    CREATE TABLE `ba_region` (  
      `region_id` varchar(10) COLLATE utf8mb4_bin NOT NULL COMMENT '地区代码',
      `region_name` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '地区名称',
      `region_type` varchar(2) COLLATE utf8mb4_bin NOT NULL COMMENT '地区级别',
      `region_Parent` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '上级地区代码',
      `province_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '省代码',
      `province_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '省名称',
      `city_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '市代码',
      `city_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '市名称',
      `district_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区/县代码',
      `district_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区/县名称',
      `street_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '乡/镇/街代码',
      `street_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '乡/镇/街名称',
      `creator` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '创建人员',
      `create_time` datetime DEFAULT NULL COMMENT '创建时间',
      `modifier` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '修改人员',
      `modify_time` datetime DEFAULT NULL COMMENT '修改时间',
      `ip_address` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'IP地址',
      PRIMARY KEY (`id`),
      KEY `inx_region_parent` (`region_Parent`) USING BTREE,
      KEY `inx_region_type` (`region_type`) USING BTREE
    ) ENGINE=InnoDB AUTO_INCREMENT=47122 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin COMMENT='地区基础数据';
    

    2.2 添加相关依赖和配置

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>2.0.0</version>
        </dependency>
    
        <!--mysql 驱动依赖-->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.10</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
           <groupId>org.apache.httpcomponents</groupId>
           <artifactId>httpclient</artifactId>
           <version>4.5.2</version>
       </dependency>
       <dependency>
           <groupId>org.apache.httpcomponents</groupId>
           <artifactId>httpcore</artifactId>
           <version>4.4.4</version>
       </dependency>
    </dependencies>
    

    数据库配置:

    spring:
      datasource:
        driver-class-name: com.mysql.cj.jdbc.Driver
        url: jdbc:mysql://localhost:3306/test?serverTimezone=UTC&useUnicode=true&useSSL=false&useUnicode=true&characterEncoding=utf8
        username: root
        password: sa000
    

    2.3 实现抓取逻辑

    1. 新增自定义页面处理组件
    public class RegionPageProcessor implements PageProcessor {
    
        private Site site = Site.me()
                .setCharset("GBK")
                // 设置置重试次数
                .setRetryTimes(3)
                // 设置循环重试次数
                .setCycleRetryTimes(3)
                // 设置抓取间隔
                .setSleepTime(100)
                // 设置超时时间,单位毫秒
                .setTimeOut(10000);
    
        private String SPIDER_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";
    
    
        /**
         * 获取搜寻器配置
         *
         * @return
         */
        @Override
        public Site getSite() {
            return site;
        }
    
        /**
         * 处理
         *
         * @param page
         */
        @Override
        public void process(Page page) {
            page.setCharset("UTF-8");
    
            final Map<RegionType, List<Selectable>> map = groupingNode(page);
            for (RegionType regionType : map.keySet()) {
                final List<Selectable> selectables = map.get(regionType);
                if (selectables == null || selectables.size() <= 0) {
                    continue;
                }
    
                List<BaRegion> regionList = conversion(page, regionType, selectables);
                if (regionList.size() <= 0) {
                    continue;
                }
                page.putField("data", regionList);
            }
    
        }
    
    
        private Map<RegionType, List<Selectable>> groupingNode(Page page) {
            Map<RegionType, List<Selectable>> map = new HashMap<>();
            addNodeGroup(map, page, PROVINCE);
            addNodeGroup(map, page, RegionType.CITY);
            addNodeGroup(map, page, RegionType.COUNTY);
            addNodeGroup(map, page, RegionType.TOWN);
            return map;
        }
    
        private void addNodeGroup(Map<RegionType, List<Selectable>> map, Page page, RegionType regionType) {
            final List<Selectable> cityNodes = page.getHtml().xpath("//tr[@class='" + regionType.getClassName() + "']/td").nodes();
            if (cityNodes != null && cityNodes.size() > 0) {
                map.put(regionType, cityNodes);
            }
        }
    
    
        private List<BaRegion> conversion(Page page, RegionType regionType, List<Selectable> selectables) {
            List<BaRegion> result = new ArrayList<>();
            String code = "";
            String nodeCode = "";
            for (Selectable selectable : selectables) {
                final List<String> all = selectable.xpath("//*/text()").all();
                if (all == null || all.size() <= 0) {
                    continue;
                }
                String nodeName = all.get(0);
                if (all.size() > 1) {
                    nodeName = all.get(1);
                }
    
                if (StringUtils.isBlank(nodeName)) {
                    continue;
                }
                if (compile("[0-9]*").matcher(nodeName).matches()) {
                    code = nodeName;
                    continue;
                }
                final List<String> links = selectable.links().all();
                // 部分市辖区下面没有区域
                if (links.size() <= 0) {
                    if (StringUtils.isBlank(code)) {
                        continue;
                    }
                    if (RegionType.COUNTY.equals(regionType)) {
                        nodeCode = (code.substring(0, 2) + "/" + code.substring(2, 4) + "/" + code.substring(0, 6));
                    }
    
                } else {
                    // 获取下一级 URL
                    String newUrl = links.get(0);
                    boolean nextLevelSwitch = !RegionType.TOWN.equals(regionType) && StringUtils.isNotBlank(newUrl);
                    if (nextLevelSwitch) {
                        page.addTargetRequest(newUrl);
                    }
                    nodeCode = newUrl.replace(SPIDER_URL, "").replace(".html", "");
    
                }
                if (StringUtils.isBlank(nodeCode)) {
                    continue;
                }
                // 组装数据
                BaRegion baRegion = toBaRegion(regionType, nodeCode, nodeName);
                result.add(baRegion);
            }
            return result;
        }
    
        private BaRegion toBaRegion(RegionType regionType, String code, String nodeName) {
            BaRegion region = new BaRegion();
            region.setRegionType(regionType.getCode());
            region.setRegionName(nodeName);
    
            String[] split;
            String parentId;
            String regionId;
    
            switch (regionType) {
                case PROVINCE:
                    region.setRegionId(code + "0000");
                    region.setRegionParent("1000");
                    break;
                case CITY:
                    split = code.split("/");
                    parentId = split[0] + "0000";
                    regionId = split[split.length - 1] + "00";
                    region.setRegionId(regionId);
                    region.setRegionParent(parentId);
                    break;
                case COUNTY:
                    split = code.split("/");
                    parentId = split[0] + split[1] + "00";
                    regionId = split[2];
                    region.setRegionId(regionId);
                    region.setRegionParent(parentId);
                    break;
                case TOWN:
                    split = code.split("/");
                    if (split.length == 3) {// 针对上一级直接是城市的地区
                        parentId = split[0] + split[1] + "00";
                        regionId = split[2];
                    } else {
                        parentId = split[0] + split[1] + split[2];
                        regionId = split[3];
                    }
                    region.setRegionId(regionId);
                    region.setRegionParent(parentId);
                    break;
                default:
                    break;
            }
            return region;
        }
    
    
    }
    

    涉及枚举类:

    public enum RegionType {
        PROVINCE("10", "provincetr", "省份"),
        CITY("20", "citytr", "城市"),
        COUNTY("30", "countytr", "区/县"),
        TOWN("40", "towntr", "镇/街道"),
        VILLAGE("50", "villagetr", "居委会"),
        ;
    
        private String code;
        private String className;
        private String desc;
    
        RegionType(String code, String className, String desc) {
            this.code = code;
            this.className = className;
            this.desc = desc;
        }
    
        public String getCode() {
            return code;
        }
    
        public String getDesc() {
            return desc;
        }
    
        public String getClassName() {
            return className;
        }
    }
    
    
    1. 实现结果持久化到数据库
    @Component
    public class SqlPipeline implements Pipeline {
    
        @Autowired
        private BaRegionMapper baRegionMapper;
    
        @Override
        public void process(ResultItems resultItems, Task task) {
            List<BaRegion> baRegionList = resultItems.get("data");
            if (baRegionList == null || baRegionList.size() <= 0) {
                System.out.println("抓取数据不存在");
                return;
            }
            for (BaRegion baRegion : baRegionList) {
                baRegionMapper.add(baRegion);
            }
        }
    }
    

    DAO 实现如下:

    @Mapper
    public interface BaRegionMapper {
        @Insert("INSERT INTO ba_region VALUES(#{regionId},#{regionName},#{regionType},#{regionParent}," +
                "#{provinceId},#{provinceName},#{cityId},#{cityName},#{districtId},#{districtName},#{streetId}," +
                "#{streetName},#{creator},#{createTime},#{modifier},#{modifyTime},#{ipAddress})")
        int add(BaRegion region);
    }
    
    1. 默认的页面下载实现类 HttpClientDownloader 没有记录下载失败的URL ,我们继承 HttpClientDownloader 实现 onError 方法记录异常 URL 列表:
    @ThreadSafe
    @Component
    public class HttpClientDownloader2 extends HttpClientDownloader {
        
        private static List<String> errorUrls = new ArrayList<>();
    
        @Override
        protected void onError(Request request) {
            super.onError(request);
            errorUrls.add(request.getUrl());
        }
    
        public List<String> getErrorUrls() {
            return errorUrls;
        }
    
    }
    
    1. 执行抓取:
    @SpringBootTest
    class ReptileApplicationTests {
    
        @Autowired
        private SqlPipeline sqlPipeline;
        @Autowired
        private HttpClientDownloader2 httpClientDownloader2;
    
        @Test
        public void start() {
            exec("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html");
    		// 异常URL 再次尝试抓取
            final List<String> errorUrls = httpClientDownloader2.getErrorUrls();
            if (errorUrls != null && errorUrls.size() > 0) {
                for (String errorUrl : errorUrls) {
                    errorUrls.remove(errorUrl);
                    exec(errorUrl);
                }
            }
            // 二次抓取失败的异常URL,打印输出
            final List<String> errorUrls2 = httpClientDownloader2.getErrorUrls();
            for (String errorUrl : errorUrls2) {
                System.out.println(errorUrl);
            }
        }
    
        private void exec(String url) {
            List<SpiderListener> spiderListeners = new ArrayList<>();
            // 记录异常的URL
            spiderListeners.add(new ErrorLogSpiderListener());
            Spider.create(new RegionPageProcessor())
                    .addUrl(url)
                    .setDownloader(httpClientDownloader2)
                    .addPipeline(sqlPipeline).run();
    
        }
    }
    

    2.4 整理数据

    1. 去除重复数据
    -- 创建自增ID 
    ALTER TABLE `test`.`ba_region` 
    ADD COLUMN `id` bigint(20) NOT NULL AUTO_INCREMENT FIRST,
    ADD PRIMARY KEY (`id`);
    -- 删除重复数据
    delete from ba_region where id in(
        select id from (
            select max(id)as id from ba_region 
            	where region_id in(
                    	select region_id from ba_region  
                    		group by region_id HAVING count(1) > 1
                ) 
            group by region_id) a
    )
    
    1. 手动添加国家信息
    INSERT INTO `ba_region`(`region_id`, `region_name`, `region_type`, `region_Parent`, `province_id`, `province_name`, `city_id`, `city_name`, `district_id`, `district_name`, `street_id`, `street_name`, `creator`, `create_time`, `modifier`, `modify_time`, `ip_address`) VALUES ('1000', '中国', '00', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
    
    1. 更新省级数据
    update ba_region set region_Parent='1000',province_id=region_id,province_name=  region_name  where region_type='10';
    
    1. 更新市级信息
    -- 更新城市字段
    update ba_region set city_id =region_id,city_name = region_name   where region_type='20';
    -- 更新省份字段
    update ba_region a,ba_region b set a.province_id=b.region_id,a.province_name= b.region_name
    where a.region_type='20' and a.region_Parent = b.region_id   ;
    
    1. 更新区/县级信息
    -- 更新区/县字段
    update ba_region set district_id =region_id,district_name = region_name   where region_type='30';
    -- 更新省份、城市字段
    update ba_region a,ba_region b set  a.province_id=b.province_id,a.province_name= b.province_name, a.city_id=b.region_id,a.city_name= b.region_name
    where a.region_type='30' and a.region_Parent = b.region_id;
    

    6) 更新镇/街道信息

    -- 更新镇/街道字段
    update ba_region set street_id =region_id,street_name = region_name   where region_type='40';
    -- 更新省份、城市、区/县字段
    update ba_region a,ba_region b set  a.province_id=b.province_id,a.province_name= b.province_name, a.city_id=b.city_id,a.city_name= b.city_name,a.district_id=b.region_id,a.district_name= b.region_name
    where a.region_type='40' and a.region_Parent = b.region_id;       
    
    1. 删除自增字段
    ALTER TABLE `test`.`ba_region` 
    DROP COLUMN `id`,
    DROP PRIMARY KEY;
    
  • 相关阅读:
    第4月第1天 makefile automake
    第3月30天 UIImage imageWithContentsOfFile卡顿 Can't add self as subview MPMoviePlayerControlle rcrash
    第3月第27天 uitableviewcell复用
    learning uboot fstype command
    learning uboot part command
    linux command dialog
    linux command curl and sha256sum implement download verification package
    learning shell script prompt to run with superuser privileges (4)
    learning shell get script absolute path (3)
    learning shell args handing key=value example (2)
  • 原文地址:https://www.cnblogs.com/markLogZhu/p/13557933.html
Copyright © 2011-2022 走看看