zoukankan      html  css  js  c++  java
  • 抓取国家地区基础数据

    一、介绍

    1.1 背景

    项目中地区基础数据从上线后再也没有更新过,与目前的国家地区数据不一致,需要做同步操作。

    1.2 解决方案

    通过采用爬虫手段从 国家统计局官网 抓取数据做同步。

    采用 WebMagic 框架实现操作

    二、操作

    2.1 创建本地临时表

    CREATE TABLE `ba_region` (  
      `region_id` varchar(10) COLLATE utf8mb4_bin NOT NULL COMMENT '地区代码',
      `region_name` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '地区名称',
      `region_type` varchar(2) COLLATE utf8mb4_bin NOT NULL COMMENT '地区级别',
      `region_Parent` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '上级地区代码',
      `province_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '省代码',
      `province_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '省名称',
      `city_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '市代码',
      `city_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '市名称',
      `district_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区/县代码',
      `district_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区/县名称',
      `street_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '乡/镇/街代码',
      `street_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '乡/镇/街名称',
      `creator` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '创建人员',
      `create_time` datetime DEFAULT NULL COMMENT '创建时间',
      `modifier` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '修改人员',
      `modify_time` datetime DEFAULT NULL COMMENT '修改时间',
      `ip_address` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'IP地址',
      PRIMARY KEY (`id`),
      KEY `inx_region_parent` (`region_Parent`) USING BTREE,
      KEY `inx_region_type` (`region_type`) USING BTREE
    ) ENGINE=InnoDB AUTO_INCREMENT=47122 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin COMMENT='地区基础数据';
    

    2.2 添加相关依赖和配置

    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    
        <dependency>
            <groupId>org.mybatis.spring.boot</groupId>
            <artifactId>mybatis-spring-boot-starter</artifactId>
            <version>2.0.0</version>
        </dependency>
    
        <!--mysql 驱动依赖-->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.10</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
           <groupId>org.apache.httpcomponents</groupId>
           <artifactId>httpclient</artifactId>
           <version>4.5.2</version>
       </dependency>
       <dependency>
           <groupId>org.apache.httpcomponents</groupId>
           <artifactId>httpcore</artifactId>
           <version>4.4.4</version>
       </dependency>
    </dependencies>
    

    数据库配置:

    spring:
      datasource:
        driver-class-name: com.mysql.cj.jdbc.Driver
        url: jdbc:mysql://localhost:3306/test?serverTimezone=UTC&useUnicode=true&useSSL=false&useUnicode=true&characterEncoding=utf8
        username: root
        password: sa000
    

    2.3 实现抓取逻辑

    1. 新增自定义页面处理组件
    public class RegionPageProcessor implements PageProcessor {
    
        private Site site = Site.me()
                .setCharset("GBK")
                // 设置置重试次数
                .setRetryTimes(3)
                // 设置循环重试次数
                .setCycleRetryTimes(3)
                // 设置抓取间隔
                .setSleepTime(100)
                // 设置超时时间,单位毫秒
                .setTimeOut(10000);
    
        private String SPIDER_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";
    
    
        /**
         * 获取搜寻器配置
         *
         * @return
         */
        @Override
        public Site getSite() {
            return site;
        }
    
        /**
         * 处理
         *
         * @param page
         */
        @Override
        public void process(Page page) {
            page.setCharset("UTF-8");
    
            final Map<RegionType, List<Selectable>> map = groupingNode(page);
            for (RegionType regionType : map.keySet()) {
                final List<Selectable> selectables = map.get(regionType);
                if (selectables == null || selectables.size() <= 0) {
                    continue;
                }
    
                List<BaRegion> regionList = conversion(page, regionType, selectables);
                if (regionList.size() <= 0) {
                    continue;
                }
                page.putField("data", regionList);
            }
    
        }
    
    
        private Map<RegionType, List<Selectable>> groupingNode(Page page) {
            Map<RegionType, List<Selectable>> map = new HashMap<>();
            addNodeGroup(map, page, PROVINCE);
            addNodeGroup(map, page, RegionType.CITY);
            addNodeGroup(map, page, RegionType.COUNTY);
            addNodeGroup(map, page, RegionType.TOWN);
            return map;
        }
    
        private void addNodeGroup(Map<RegionType, List<Selectable>> map, Page page, RegionType regionType) {
            final List<Selectable> cityNodes = page.getHtml().xpath("//tr[@class='" + regionType.getClassName() + "']/td").nodes();
            if (cityNodes != null && cityNodes.size() > 0) {
                map.put(regionType, cityNodes);
            }
        }
    
    
        private List<BaRegion> conversion(Page page, RegionType regionType, List<Selectable> selectables) {
            List<BaRegion> result = new ArrayList<>();
            String code = "";
            String nodeCode = "";
            for (Selectable selectable : selectables) {
                final List<String> all = selectable.xpath("//*/text()").all();
                if (all == null || all.size() <= 0) {
                    continue;
                }
                String nodeName = all.get(0);
                if (all.size() > 1) {
                    nodeName = all.get(1);
                }
    
                if (StringUtils.isBlank(nodeName)) {
                    continue;
                }
                if (compile("[0-9]*").matcher(nodeName).matches()) {
                    code = nodeName;
                    continue;
                }
                final List<String> links = selectable.links().all();
                // 部分市辖区下面没有区域
                if (links.size() <= 0) {
                    if (StringUtils.isBlank(code)) {
                        continue;
                    }
                    if (RegionType.COUNTY.equals(regionType)) {
                        nodeCode = (code.substring(0, 2) + "/" + code.substring(2, 4) + "/" + code.substring(0, 6));
                    }
    
                } else {
                    // 获取下一级 URL
                    String newUrl = links.get(0);
                    boolean nextLevelSwitch = !RegionType.TOWN.equals(regionType) && StringUtils.isNotBlank(newUrl);
                    if (nextLevelSwitch) {
                        page.addTargetRequest(newUrl);
                    }
                    nodeCode = newUrl.replace(SPIDER_URL, "").replace(".html", "");
    
                }
                if (StringUtils.isBlank(nodeCode)) {
                    continue;
                }
                // 组装数据
                BaRegion baRegion = toBaRegion(regionType, nodeCode, nodeName);
                result.add(baRegion);
            }
            return result;
        }
    
        private BaRegion toBaRegion(RegionType regionType, String code, String nodeName) {
            BaRegion region = new BaRegion();
            region.setRegionType(regionType.getCode());
            region.setRegionName(nodeName);
    
            String[] split;
            String parentId;
            String regionId;
    
            switch (regionType) {
                case PROVINCE:
                    region.setRegionId(code + "0000");
                    region.setRegionParent("1000");
                    break;
                case CITY:
                    split = code.split("/");
                    parentId = split[0] + "0000";
                    regionId = split[split.length - 1] + "00";
                    region.setRegionId(regionId);
                    region.setRegionParent(parentId);
                    break;
                case COUNTY:
                    split = code.split("/");
                    parentId = split[0] + split[1] + "00";
                    regionId = split[2];
                    region.setRegionId(regionId);
                    region.setRegionParent(parentId);
                    break;
                case TOWN:
                    split = code.split("/");
                    if (split.length == 3) {// 针对上一级直接是城市的地区
                        parentId = split[0] + split[1] + "00";
                        regionId = split[2];
                    } else {
                        parentId = split[0] + split[1] + split[2];
                        regionId = split[3];
                    }
                    region.setRegionId(regionId);
                    region.setRegionParent(parentId);
                    break;
                default:
                    break;
            }
            return region;
        }
    
    
    }
    

    涉及枚举类:

    public enum RegionType {
        PROVINCE("10", "provincetr", "省份"),
        CITY("20", "citytr", "城市"),
        COUNTY("30", "countytr", "区/县"),
        TOWN("40", "towntr", "镇/街道"),
        VILLAGE("50", "villagetr", "居委会"),
        ;
    
        private String code;
        private String className;
        private String desc;
    
        RegionType(String code, String className, String desc) {
            this.code = code;
            this.className = className;
            this.desc = desc;
        }
    
        public String getCode() {
            return code;
        }
    
        public String getDesc() {
            return desc;
        }
    
        public String getClassName() {
            return className;
        }
    }
    
    
    1. 实现结果持久化到数据库
    @Component
    public class SqlPipeline implements Pipeline {
    
        @Autowired
        private BaRegionMapper baRegionMapper;
    
        @Override
        public void process(ResultItems resultItems, Task task) {
            List<BaRegion> baRegionList = resultItems.get("data");
            if (baRegionList == null || baRegionList.size() <= 0) {
                System.out.println("抓取数据不存在");
                return;
            }
            for (BaRegion baRegion : baRegionList) {
                baRegionMapper.add(baRegion);
            }
        }
    }
    

    DAO 实现如下:

    @Mapper
    public interface BaRegionMapper {
        @Insert("INSERT INTO ba_region VALUES(#{regionId},#{regionName},#{regionType},#{regionParent}," +
                "#{provinceId},#{provinceName},#{cityId},#{cityName},#{districtId},#{districtName},#{streetId}," +
                "#{streetName},#{creator},#{createTime},#{modifier},#{modifyTime},#{ipAddress})")
        int add(BaRegion region);
    }
    
    1. 默认的页面下载实现类 HttpClientDownloader 没有记录下载失败的URL ,我们继承 HttpClientDownloader 实现 onError 方法记录异常 URL 列表:
    @ThreadSafe
    @Component
    public class HttpClientDownloader2 extends HttpClientDownloader {
        
        private static List<String> errorUrls = new ArrayList<>();
    
        @Override
        protected void onError(Request request) {
            super.onError(request);
            errorUrls.add(request.getUrl());
        }
    
        public List<String> getErrorUrls() {
            return errorUrls;
        }
    
    }
    
    1. 执行抓取:
    @SpringBootTest
    class ReptileApplicationTests {
    
        @Autowired
        private SqlPipeline sqlPipeline;
        @Autowired
        private HttpClientDownloader2 httpClientDownloader2;
    
        @Test
        public void start() {
            exec("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html");
    		// 异常URL 再次尝试抓取
            final List<String> errorUrls = httpClientDownloader2.getErrorUrls();
            if (errorUrls != null && errorUrls.size() > 0) {
                for (String errorUrl : errorUrls) {
                    errorUrls.remove(errorUrl);
                    exec(errorUrl);
                }
            }
            // 二次抓取失败的异常URL,打印输出
            final List<String> errorUrls2 = httpClientDownloader2.getErrorUrls();
            for (String errorUrl : errorUrls2) {
                System.out.println(errorUrl);
            }
        }
    
        private void exec(String url) {
            List<SpiderListener> spiderListeners = new ArrayList<>();
            // 记录异常的URL
            spiderListeners.add(new ErrorLogSpiderListener());
            Spider.create(new RegionPageProcessor())
                    .addUrl(url)
                    .setDownloader(httpClientDownloader2)
                    .addPipeline(sqlPipeline).run();
    
        }
    }
    

    2.4 整理数据

    1. 去除重复数据
    -- 创建自增ID 
    ALTER TABLE `test`.`ba_region` 
    ADD COLUMN `id` bigint(20) NOT NULL AUTO_INCREMENT FIRST,
    ADD PRIMARY KEY (`id`);
    -- 删除重复数据
    delete from ba_region where id in(
        select id from (
            select max(id)as id from ba_region 
            	where region_id in(
                    	select region_id from ba_region  
                    		group by region_id HAVING count(1) > 1
                ) 
            group by region_id) a
    )
    
    1. 手动添加国家信息
    INSERT INTO `ba_region`(`region_id`, `region_name`, `region_type`, `region_Parent`, `province_id`, `province_name`, `city_id`, `city_name`, `district_id`, `district_name`, `street_id`, `street_name`, `creator`, `create_time`, `modifier`, `modify_time`, `ip_address`) VALUES ('1000', '中国', '00', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
    
    1. 更新省级数据
    update ba_region set region_Parent='1000',province_id=region_id,province_name=  region_name  where region_type='10';
    
    1. 更新市级信息
    -- 更新城市字段
    update ba_region set city_id =region_id,city_name = region_name   where region_type='20';
    -- 更新省份字段
    update ba_region a,ba_region b set a.province_id=b.region_id,a.province_name= b.region_name
    where a.region_type='20' and a.region_Parent = b.region_id   ;
    
    1. 更新区/县级信息
    -- 更新区/县字段
    update ba_region set district_id =region_id,district_name = region_name   where region_type='30';
    -- 更新省份、城市字段
    update ba_region a,ba_region b set  a.province_id=b.province_id,a.province_name= b.province_name, a.city_id=b.region_id,a.city_name= b.region_name
    where a.region_type='30' and a.region_Parent = b.region_id;
    

    6) 更新镇/街道信息

    -- 更新镇/街道字段
    update ba_region set street_id =region_id,street_name = region_name   where region_type='40';
    -- 更新省份、城市、区/县字段
    update ba_region a,ba_region b set  a.province_id=b.province_id,a.province_name= b.province_name, a.city_id=b.city_id,a.city_name= b.city_name,a.district_id=b.region_id,a.district_name= b.region_name
    where a.region_type='40' and a.region_Parent = b.region_id;       
    
    1. 删除自增字段
    ALTER TABLE `test`.`ba_region` 
    DROP COLUMN `id`,
    DROP PRIMARY KEY;
    
  • 相关阅读:
    put 创建索引
    post创建一个空索引
    elasticsearch 安装marvel
    elasticsearch 安装bigdesk插件
    perl 播放windows文件
    如何在十分钟内插入1亿条记录到Oracle数据库?
    常见地址说明
    如何做出实用而强大的数据地图?
    如何利用jqGrid表格方法重新设置caption属性值
    如何给jqGrid表格按照字段进行分组
  • 原文地址:https://www.cnblogs.com/markLogZhu/p/13557933.html
Copyright © 2011-2022 走看看