1、创建分区表
-- Lianjia listing-price table.
-- Every attribute except the crawl timestamp and pk_id is kept as STRING.
-- Partitioned by province and original city name, with comma-delimited fields.
CREATE TABLE ods_lj_hse_list_pr_ljj (
    id                 STRING    COMMENT 'uuid',
    lj_area_orig       STRING    COMMENT '行政区县_原始',
    lj_comm_id_orig    STRING    COMMENT '小区ID',
    lj_comm_name_orig  STRING    COMMENT '小区名_原始',
    lj_alias_name_orig STRING    COMMENT '小区别名_原始',
    lj_comm_addr_orig  STRING    COMMENT '小区地址_原始',
    get_method         STRING    COMMENT '获取方式',
    lj_title           STRING    COMMENT '链家房源标题',
    lj_hse_url         STRING    COMMENT '链家房源URL地址',
    lj_hse_list_pr     STRING    COMMENT '链家房源挂牌价',
    lj_hse_sq_list_pr  STRING    COMMENT '链家房源挂牌单价',
    lj_hse_type        STRING    COMMENT '链家房源户型',
    lj_hse_area        STRING    COMMENT '链家房源面积',
    lj_hse_twd         STRING    COMMENT '链家房源朝向',
    lj_hse_deg_dect    STRING    COMMENT '链家房源装修程度',
    lj_hse_flr_type    STRING    COMMENT '链家房源楼层',
    lj_hse_ttl_flr     STRING    COMMENT '链家房源总楼层',
    lj_hse_bld_year    STRING    COMMENT '链家房源建成年份',
    lj_hse_bld_stru    STRING    COMMENT '链家房源建筑结构',
    lj_hse_crawl_tm    TIMESTAMP COMMENT '链家房源爬取时间',
    lj_hse_loc         STRING    COMMENT '链家房源位置',
    pk_id              INT       COMMENT '唯一id',
    batch_id           STRING    COMMENT '批次号'
)
PARTITIONED BY (
    lj_prov      STRING COMMENT '省',
    lj_city_orig STRING COMMENT '市_原始'
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ",";
-- Lianjia transaction-price staging table with typed numeric and timestamp columns.
-- Partitioned by province, original city name and original district name,
-- with comma-delimited fields.
CREATE TABLE source_city_trans_pr_date_creator_temp (
    id                 STRING    COMMENT 'uuid',
    lj_comm_id_orig    STRING    COMMENT '小区ID',
    lj_comm_name_orig  STRING    COMMENT '小区名_原始',
    lj_alias_name_orig STRING    COMMENT '小区别名_原始',
    lj_comm_addr_orig  STRING    COMMENT '小区地址_原始',
    get_method         STRING    COMMENT '获取方式',
    lj_title           STRING    COMMENT '链家房源标题',
    lj_hse_url         STRING    COMMENT '链家房源URL地址',
    lj_hse_list_pr     INT       COMMENT '链家房源挂牌价',
    lj_hse_sq_list_pr  FLOAT     COMMENT '链家房源挂牌单价',
    lj_hse_type        STRING    COMMENT '链家房源户型',
    lj_hse_area        FLOAT     COMMENT '链家房源面积',
    lj_hse_twd         STRING    COMMENT '链家房源朝向',
    lj_hse_deg_dect    STRING    COMMENT '链家房源装修程度',
    lj_hse_flr_type    STRING    COMMENT '链家房源楼层',
    lj_hse_ttl_flr     INT       COMMENT '链家房源总楼层',
    lj_hse_bld_year    INT       COMMENT '链家房源建成年份',
    lj_hse_bld_stru    STRING    COMMENT '链家房源建筑结构',
    lj_hse_trans_cycle STRING    COMMENT '链家房源成交周期',
    lj_hse_trans_pr    FLOAT     COMMENT '链家房源成交价',
    lj_per_sq_trans_pr FLOAT     COMMENT '链家房源成交均价',
    lj_hse_trans_date  TIMESTAMP COMMENT '链家房源成交日期',
    lj_hse_crawl_tm    TIMESTAMP COMMENT '链家房源爬取时间',
    pk_id              INT       COMMENT '唯一主键',
    batch_id           INT       COMMENT '批次号'
)
PARTITIONED BY (
    lj_prov      STRING COMMENT '省',
    lj_city_orig STRING COMMENT '市_原始',
    lj_area_orig STRING COMMENT '行政区县_原始'
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ",";
2、插入数据
-- Copy every row of source_city_trans_pr_date_creator into the partitioned
-- staging table, generating a fresh uuid() for the id column.
--
-- All three partition values come from the query result, so dynamic
-- partitioning has to be enabled in the session before the insert runs.
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;

-- The PARTITION clause names the dynamic partition columns explicitly because
-- Hive releases before 3.0 reject an insert into a partitioned table without it.
-- The last three SELECT expressions feed those partition columns by position,
-- in the same order as the PARTITION clause.
INSERT INTO TABLE source_city_trans_pr_date_creator_temp
PARTITION (lj_prov, lj_city_orig, lj_area_orig)
SELECT
    uuid(),
    lj_comm_id_orig,
    lj_comm_name_orig,
    lj_alias_name_orig,
    lj_comm_addr_orig,
    get_method,
    lj_title,
    lj_hse_url,
    lj_hse_list_pr,
    lj_hse_sq_list_pr,
    lj_hse_type,
    lj_hse_area,
    lj_hse_twd,
    lj_hse_deg_dect,
    lj_hse_flr_type,
    lj_hse_ttl_flr,
    lj_hse_bld_year,
    lj_hse_bld_stru,
    lj_hse_trans_cycle,
    lj_hse_trans_pr,
    lj_per_sq_trans_pr,
    lj_hse_trans_date,
    lj_hse_crawl_tm,
    pk_id,
    batch_id,
    lj_province,
    lj_city_orig,
    lj_area_orig
FROM source_city_trans_pr_date_creator;
3、设置动态分区参数(开启动态分区并调大分区数量上限,需在第 2 步的动态分区插入之前执行)
-- Session settings for dynamic-partition inserts.
-- Turn dynamic partitioning on.
SET hive.exec.dynamic.partition=true;
-- nonstrict mode lets every partition column be dynamic (no static partition required).
SET hive.exec.dynamic.partition.mode=nonstrict;
-- Upper bound on dynamic partitions created by a single mapper or reducer node.
SET hive.exec.max.dynamic.partitions.pernode=10000;
-- Upper bound on dynamic partitions created by one statement in total.
SET hive.exec.max.dynamic.partitions=100000;
4、筛选并插入数据(仅保留总楼层 lj_hse_ttl_flr 不为空的记录)
-- Copy into table test only the rows whose lj_hse_ttl_flr is not NULL.
-- The terminating semicolon was missing, which made the statement run into
-- whatever text followed it in a script.
-- NOTE(review): SELECT * relies on test having the same column order as
-- source_city_list_pr_date_creator_temp - confirm, or switch to an explicit
-- column list.
INSERT INTO TABLE test
SELECT *
FROM source_city_list_pr_date_creator_temp
WHERE lj_hse_ttl_flr IS NOT NULL;
5、其他操作
①从csv导入数据
-- Load the CSV file into source_city_list_pr_date_creator.
-- There is no LOCAL keyword, so the path is resolved on the cluster
-- filesystem rather than on the client machine.
LOAD DATA INPATH '/user/Linjj/test11.csv'
INTO TABLE source_city_list_pr_date_creator;
②查询
-- Ad-hoc inspection query: return every column of every row.
SELECT *
FROM ods_lj_tran_pr_crawl;
③清空表
-- Remove all rows from the table while keeping its definition.
TRUNCATE TABLE source_city_list_pr_date_creator;
④更改字段数据类型
-- Change the declared type of batch_id to STRING; the column name stays the same.
ALTER TABLE source_city_list_pr_date_creator
CHANGE COLUMN batch_id batch_id STRING;
⑤删除表
-- Drop the transaction-price staging table.
DROP TABLE source_city_trans_pr_date_creator_temp;
⑥复制表结构并插入数据
-- Clone the definition of test_spider.source_city_trans_pr_date_creator_temp
-- into ods_data_collection.ods_lj_tran_pr_crawl (only if it does not exist yet),
-- then copy every row across.
CREATE TABLE IF NOT EXISTS ods_data_collection.ods_lj_tran_pr_crawl
LIKE test_spider.source_city_trans_pr_date_creator_temp;

-- CREATE TABLE ... LIKE also copies the partition layout, so the target is
-- partitioned. The PARTITION clause is spelled out because Hive releases
-- before 3.0 reject an insert into a partitioned table without it.
-- SELECT * on a partitioned table returns the partition columns last, which
-- is the position dynamic partitioning expects.
-- Requires hive.exec.dynamic.partition=true and
-- hive.exec.dynamic.partition.mode=nonstrict in the session.
-- NOTE(review): assumes the test_spider table is partitioned by
-- (lj_prov, lj_city_orig, lj_area_orig) like the staging table defined in
-- this file - confirm.
INSERT INTO TABLE ods_data_collection.ods_lj_tran_pr_crawl
PARTITION (lj_prov, lj_city_orig, lj_area_orig)
SELECT *
FROM test_spider.source_city_trans_pr_date_creator_temp;