一、介绍
1.1 背景
项目中地区基础数据从上线后再也没有更新过,与目前的国家地区数据不一致,需要做同步操作。
1.2 解决方案
通过采用爬虫手段从 国家统计局官网 抓取数据做同步。
采用 WebMagic 框架实现操作
二、操作
2.1 创建本地临时表
CREATE TABLE `ba_region` (
`region_id` varchar(10) COLLATE utf8mb4_bin NOT NULL COMMENT '地区代码',
`region_name` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '地区名称',
`region_type` varchar(2) COLLATE utf8mb4_bin NOT NULL COMMENT '地区级别',
`region_Parent` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '上级地区代码',
`province_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '省代码',
`province_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '省名称',
`city_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '市代码',
`city_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '市名称',
`district_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区/县代码',
`district_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区/县名称',
`street_id` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '乡/镇/街代码',
`street_name` varchar(255) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '乡/镇/街名称',
`creator` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '创建人员',
`create_time` datetime DEFAULT NULL COMMENT '创建时间',
`modifier` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '修改人员',
`modify_time` datetime DEFAULT NULL COMMENT '修改时间',
`ip_address` varchar(30) COLLATE utf8mb4_bin DEFAULT NULL COMMENT 'IP地址',
PRIMARY KEY (`id`),
KEY `inx_region_parent` (`region_Parent`) USING BTREE,
KEY `inx_region_type` (`region_type`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=47122 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin COMMENT='地区基础数据';
2.2 添加相关依赖和配置
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.mybatis.spring.boot</groupId>
<artifactId>mybatis-spring-boot-starter</artifactId>
<version>2.0.0</version>
</dependency>
<!--mysql 驱动依赖-->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.10</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.4</version>
</dependency>
</dependencies>
数据库配置:
spring:
datasource:
driver-class-name: com.mysql.cj.jdbc.Driver
url: jdbc:mysql://localhost:3306/test?serverTimezone=UTC&useUnicode=true&useSSL=false&useUnicode=true&characterEncoding=utf8
username: root
password: sa000
2.3 实现抓取逻辑
- 新增自定义页面处理组件
public class RegionPageProcessor implements PageProcessor {
private Site site = Site.me()
.setCharset("GBK")
// 设置置重试次数
.setRetryTimes(3)
// 设置循环重试次数
.setCycleRetryTimes(3)
// 设置抓取间隔
.setSleepTime(100)
// 设置超时时间,单位毫秒
.setTimeOut(10000);
private String SPIDER_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";
/**
* 获取搜寻器配置
*
* @return
*/
@Override
public Site getSite() {
return site;
}
/**
* 处理
*
* @param page
*/
@Override
public void process(Page page) {
page.setCharset("UTF-8");
final Map<RegionType, List<Selectable>> map = groupingNode(page);
for (RegionType regionType : map.keySet()) {
final List<Selectable> selectables = map.get(regionType);
if (selectables == null || selectables.size() <= 0) {
continue;
}
List<BaRegion> regionList = conversion(page, regionType, selectables);
if (regionList.size() <= 0) {
continue;
}
page.putField("data", regionList);
}
}
private Map<RegionType, List<Selectable>> groupingNode(Page page) {
Map<RegionType, List<Selectable>> map = new HashMap<>();
addNodeGroup(map, page, PROVINCE);
addNodeGroup(map, page, RegionType.CITY);
addNodeGroup(map, page, RegionType.COUNTY);
addNodeGroup(map, page, RegionType.TOWN);
return map;
}
private void addNodeGroup(Map<RegionType, List<Selectable>> map, Page page, RegionType regionType) {
final List<Selectable> cityNodes = page.getHtml().xpath("//tr[@class='" + regionType.getClassName() + "']/td").nodes();
if (cityNodes != null && cityNodes.size() > 0) {
map.put(regionType, cityNodes);
}
}
private List<BaRegion> conversion(Page page, RegionType regionType, List<Selectable> selectables) {
List<BaRegion> result = new ArrayList<>();
String code = "";
String nodeCode = "";
for (Selectable selectable : selectables) {
final List<String> all = selectable.xpath("//*/text()").all();
if (all == null || all.size() <= 0) {
continue;
}
String nodeName = all.get(0);
if (all.size() > 1) {
nodeName = all.get(1);
}
if (StringUtils.isBlank(nodeName)) {
continue;
}
if (compile("[0-9]*").matcher(nodeName).matches()) {
code = nodeName;
continue;
}
final List<String> links = selectable.links().all();
// 部分市辖区下面没有区域
if (links.size() <= 0) {
if (StringUtils.isBlank(code)) {
continue;
}
if (RegionType.COUNTY.equals(regionType)) {
nodeCode = (code.substring(0, 2) + "/" + code.substring(2, 4) + "/" + code.substring(0, 6));
}
} else {
// 获取下一级 URL
String newUrl = links.get(0);
boolean nextLevelSwitch = !RegionType.TOWN.equals(regionType) && StringUtils.isNotBlank(newUrl);
if (nextLevelSwitch) {
page.addTargetRequest(newUrl);
}
nodeCode = newUrl.replace(SPIDER_URL, "").replace(".html", "");
}
if (StringUtils.isBlank(nodeCode)) {
continue;
}
// 组装数据
BaRegion baRegion = toBaRegion(regionType, nodeCode, nodeName);
result.add(baRegion);
}
return result;
}
private BaRegion toBaRegion(RegionType regionType, String code, String nodeName) {
BaRegion region = new BaRegion();
region.setRegionType(regionType.getCode());
region.setRegionName(nodeName);
String[] split;
String parentId;
String regionId;
switch (regionType) {
case PROVINCE:
region.setRegionId(code + "0000");
region.setRegionParent("1000");
break;
case CITY:
split = code.split("/");
parentId = split[0] + "0000";
regionId = split[split.length - 1] + "00";
region.setRegionId(regionId);
region.setRegionParent(parentId);
break;
case COUNTY:
split = code.split("/");
parentId = split[0] + split[1] + "00";
regionId = split[2];
region.setRegionId(regionId);
region.setRegionParent(parentId);
break;
case TOWN:
split = code.split("/");
if (split.length == 3) {// 针对上一级直接是城市的地区
parentId = split[0] + split[1] + "00";
regionId = split[2];
} else {
parentId = split[0] + split[1] + split[2];
regionId = split[3];
}
region.setRegionId(regionId);
region.setRegionParent(parentId);
break;
default:
break;
}
return region;
}
}
涉及枚举类:
public enum RegionType {
PROVINCE("10", "provincetr", "省份"),
CITY("20", "citytr", "城市"),
COUNTY("30", "countytr", "区/县"),
TOWN("40", "towntr", "镇/街道"),
VILLAGE("50", "villagetr", "居委会"),
;
private String code;
private String className;
private String desc;
RegionType(String code, String className, String desc) {
this.code = code;
this.className = className;
this.desc = desc;
}
public String getCode() {
return code;
}
public String getDesc() {
return desc;
}
public String getClassName() {
return className;
}
}
- 实现结果持久化到数据库
@Component
public class SqlPipeline implements Pipeline {
@Autowired
private BaRegionMapper baRegionMapper;
@Override
public void process(ResultItems resultItems, Task task) {
List<BaRegion> baRegionList = resultItems.get("data");
if (baRegionList == null || baRegionList.size() <= 0) {
System.out.println("抓取数据不存在");
return;
}
for (BaRegion baRegion : baRegionList) {
baRegionMapper.add(baRegion);
}
}
}
DAO 实现如下:
@Mapper
public interface BaRegionMapper {
@Insert("INSERT INTO ba_region VALUES(#{regionId},#{regionName},#{regionType},#{regionParent}," +
"#{provinceId},#{provinceName},#{cityId},#{cityName},#{districtId},#{districtName},#{streetId}," +
"#{streetName},#{creator},#{createTime},#{modifier},#{modifyTime},#{ipAddress})")
int add(BaRegion region);
}
- 默认的页面下载实现类 HttpClientDownloader 没有记录下载失败的URL ,我们继承 HttpClientDownloader 实现 onError 方法记录异常 URL 列表:
@ThreadSafe
@Component
public class HttpClientDownloader2 extends HttpClientDownloader {
private static List<String> errorUrls = new ArrayList<>();
@Override
protected void onError(Request request) {
super.onError(request);
errorUrls.add(request.getUrl());
}
public List<String> getErrorUrls() {
return errorUrls;
}
}
- 执行抓取:
@SpringBootTest
class ReptileApplicationTests {
@Autowired
private SqlPipeline sqlPipeline;
@Autowired
private HttpClientDownloader2 httpClientDownloader2;
@Test
public void start() {
exec("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html");
// 异常URL 再次尝试抓取
final List<String> errorUrls = httpClientDownloader2.getErrorUrls();
if (errorUrls != null && errorUrls.size() > 0) {
for (String errorUrl : errorUrls) {
errorUrls.remove(errorUrl);
exec(errorUrl);
}
}
// 二次抓取失败的异常URL,打印输出
final List<String> errorUrls2 = httpClientDownloader2.getErrorUrls();
for (String errorUrl : errorUrls2) {
System.out.println(errorUrl);
}
}
private void exec(String url) {
List<SpiderListener> spiderListeners = new ArrayList<>();
// 记录异常的URL
spiderListeners.add(new ErrorLogSpiderListener());
Spider.create(new RegionPageProcessor())
.addUrl(url)
.setDownloader(httpClientDownloader2)
.addPipeline(sqlPipeline).run();
}
}
2.4 整理数据
- 去除重复数据
-- 创建自增ID
ALTER TABLE `test`.`ba_region`
ADD COLUMN `id` bigint(20) NOT NULL AUTO_INCREMENT FIRST,
ADD PRIMARY KEY (`id`);
-- 删除重复数据
delete from ba_region where id in(
select id from (
select max(id)as id from ba_region
where region_id in(
select region_id from ba_region
group by region_id HAVING count(1) > 1
)
group by region_id) a
)
- 手动添加国家信息
INSERT INTO `ba_region`(`region_id`, `region_name`, `region_type`, `region_Parent`, `province_id`, `province_name`, `city_id`, `city_name`, `district_id`, `district_name`, `street_id`, `street_name`, `creator`, `create_time`, `modifier`, `modify_time`, `ip_address`) VALUES ('1000', '中国', '00', '', NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
- 更新省级数据
update ba_region set region_Parent='1000',province_id=region_id,province_name= region_name where region_type='10';
- 更新市级信息
-- 更新城市字段
update ba_region set city_id =region_id,city_name = region_name where region_type='20';
-- 更新省份字段
update ba_region a,ba_region b set a.province_id=b.region_id,a.province_name= b.region_name
where a.region_type='20' and a.region_Parent = b.region_id ;
- 更新区/县级信息
-- 更新区/县字段
update ba_region set district_id =region_id,district_name = region_name where region_type='30';
-- 更新省份、城市字段
update ba_region a,ba_region b set a.province_id=b.province_id,a.province_name= b.province_name, a.city_id=b.region_id,a.city_name= b.region_name
where a.region_type='30' and a.region_Parent = b.region_id;
6) 更新镇/街道信息
-- 更新镇/街道字段
update ba_region set street_id =region_id,street_name = region_name where region_type='40';
-- 更新省份、城市、区/县字段
update ba_region a,ba_region b set a.province_id=b.province_id,a.province_name= b.province_name, a.city_id=b.city_id,a.city_name= b.city_name,a.district_id=b.region_id,a.district_name= b.region_name
where a.region_type='40' and a.region_Parent = b.region_id;
- 删除自增字段
ALTER TABLE `test`.`ba_region`
DROP COLUMN `id`,
DROP PRIMARY KEY;