zoukankan      html  css  js  c++  java
  • hive新增字段和修改字段的影响

    -- Rebuild the ORC demo table from scratch: an external table with two
    -- string columns, partitioned by inc_day, stored as ORC with Snappy.
    DROP TABLE IF EXISTS tmp_dm_test_a.t_aa_orc;
    USE tmp_dm_test_a;
    CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa_orc (
        user_id  STRING COMMENT '用户id',
        all_addr STRING COMMENT '常用地址'
    )
    PARTITIONED BY (inc_day STRING COMMENT 'inc_day used by partition')
    STORED AS ORC
    TBLPROPERTIES ('orc.compress' = 'SNAPPY');
    
    
    
    -- Dynamic partitioning in non-strict mode, as used by the
    -- INSERT ... PARTITION (inc_day) statements in this script.
    SET hive.exec.dynamic.partition=true;
    SET hive.exec.dynamic.partition.mode=nonstrict;

    -- Fetch-task conversion and parallel execution settings.
    SET hive.fetch.task.conversion=more;
    SET hive.exec.parallel=true;

    -- MapReduce file output: Snappy codec, BLOCK compression type.
    SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
    SET mapreduce.output.fileoutputformat.compress.type=BLOCK;
    
    -- Seed partitions 20180101 and 20180102 with three rows each.
    -- The partition value is aliased as inc_day and the final SELECT names
    -- its columns explicitly (dynamic-partition column last), so the mapping
    -- onto the target table no longer relies on the column order of SELECT *.
    -- NOTE(review): '上东省' may be a typo for '山东省'; left unchanged because
    -- it is sample data.
    WITH tmp AS (
        SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, '20180101' AS inc_day
        UNION ALL
        SELECT 'sf2222' AS user_id, '江西省' AS all_addr, '20180101' AS inc_day
        UNION ALL
        SELECT 'sf3333' AS user_id, '上东省' AS all_addr, '20180101' AS inc_day
        UNION ALL
        SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, '20180102' AS inc_day
        UNION ALL
        SELECT 'sf2222' AS user_id, '江西省' AS all_addr, '20180102' AS inc_day
        UNION ALL
        SELECT 'sf3333' AS user_id, '上东省' AS all_addr, '20180102' AS inc_day
    )
    INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa_orc PARTITION (inc_day)
    SELECT
        user_id,
        all_addr,
        inc_day
    FROM tmp;
    
    -- Inspection queries for the ORC table. The ones that name
    -- original_union_id or phone_number only resolve once the corresponding
    -- ALTER TABLE statements in this script have been applied.
    SELECT *
    FROM tmp_dm_test_a.t_aa_orc;

    SELECT
        user_id,
        all_addr,
        original_union_id
    FROM tmp_dm_test_a.t_aa_orc;

    SELECT
        user_id,
        all_addr,
        original_union_id
    FROM tmp_dm_test_a.t_aa_orc
    WHERE inc_day = '20180101';

    SELECT
        user_id,
        all_addr,
        original_union_id
    FROM tmp_dm_test_a.t_aa_orc
    WHERE inc_day = '20180103';

    -- Same projection, using the renamed column phone_number.
    SELECT
        phone_number,
        all_addr,
        original_union_id
    FROM tmp_dm_test_a.t_aa_orc;
    
    -- Append CASCADE to the DDL statement; otherwise the newly added column
    -- is not visible in the old (already existing) partitions.
    ALTER TABLE tmp_dm_test_a.t_aa_orc
        ADD COLUMNS (original_union_id STRING) CASCADE;
    -- Add several columns in a single statement.
    ALTER TABLE `ods_wst`.`awd_pckt_in_sm` ADD COLUMNS (
        `rule_code` STRING COMMENT '规则编码',
        `bus_type`  STRING COMMENT '扩展字段业务类型,用于关联扩展字段业务值1-4',
        `bus_attr1` STRING COMMENT '扩展字段业务值1',
        `bus_attr2` STRING COMMENT '扩展字段业务值2',
        `bus_attr3` STRING COMMENT '扩展字段业务值3',
        `bus_attr4` STRING COMMENT '扩展字段业务值4'
    ) CASCADE;

     
    -- Add the column on one existing partition only (inc_day = '20180101').
    ALTER TABLE tmp_dm_test_a.t_aa_orc PARTITION (inc_day = '20180101')
        ADD COLUMNS (original_union_id STRING);

    -- Recreate the ORC table with original_union_id declared up front.
    DROP TABLE IF EXISTS tmp_dm_test_a.t_aa_orc;
    USE tmp_dm_test_a;
    CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa_orc (
        user_id           STRING COMMENT '用户id',
        all_addr          STRING COMMENT '常用地址',
        original_union_id STRING
    )
    PARTITIONED BY (inc_day STRING COMMENT 'inc_day used by partition')
    STORED AS ORC
    TBLPROPERTIES ('orc.compress' = 'SNAPPY');

    -- Repair the partition metadata of the recreated table.
    MSCK REPAIR TABLE tmp_dm_test_a.t_aa_orc;

    -- Seed partitions 20180103 and 20180104 with rows that carry the new
    -- column. The partition value is aliased as inc_day and columns are
    -- listed explicitly (dynamic-partition column last) instead of SELECT *.
    WITH tmp AS (
        SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, 'sf0x1111' AS original_union_id, '20180103' AS inc_day
        UNION ALL
        SELECT 'sf2222' AS user_id, '江西省' AS all_addr, 'sf0x2211' AS original_union_id, '20180103' AS inc_day
        UNION ALL
        SELECT 'sf3333' AS user_id, '上东省' AS all_addr, 'sf0x3311' AS original_union_id, '20180103' AS inc_day
        UNION ALL
        SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, 'sf0x4411' AS original_union_id, '20180104' AS inc_day
        UNION ALL
        SELECT 'sf2222' AS user_id, '江西省' AS all_addr, 'sf0x5511' AS original_union_id, '20180104' AS inc_day
        UNION ALL
        SELECT 'sf3333' AS user_id, '上东省' AS all_addr, 'sf0x6611' AS original_union_id, '20180104' AS inc_day
    )
    INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa_orc PARTITION (inc_day)
    SELECT
        user_id,
        all_addr,
        original_union_id,
        inc_day
    FROM tmp;

    -- ********************
    -- Author's note: data in the earlier partitions cannot find phone_number;
    -- the historical data has to be re-run.
     -- Rename user_id to phone_number (type stays string), written twice:
     -- first without CASCADE, then with CASCADE.
     -- NOTE(review): presumably these are alternatives to compare, since both
     -- statements rename the same source column user_id.
     ALTER TABLE tmp_dm_test_a.t_aa_orc
         CHANGE COLUMN user_id phone_number STRING;

     ALTER TABLE tmp_dm_test_a.t_aa_orc
         CHANGE COLUMN user_id phone_number STRING CASCADE;
    ------------- show create table tmp_dm_test_a.t_aa_orc;
    ALTER TABLE table_name 
      [PARTITION partition_spec]                 -- (Note: Hive 0.14.0 and later)
      ADD|REPLACE COLUMNS (col_name data_type [COMMENT col_comment], ...)
      [CASCADE|RESTRICT]                         -- (Note: Hive 1.1.0 and later)

    REPLACE COLUMNS 会删除所有现有列并添加新的列集。这只能用于使用原生 SerDe 的表(DynamicSerDe、MetadataTypedColumnsetSerDe、LazySimpleSerDe 和 ColumnarSerDe)。REPLACE COLUMNS 也可以用来删除列。


    删除列示例:

    原有Hive表test_change中有a,b,c,d,e这几个字段
       将从test_change中删除“d”列:
         -- Redeclare the column list without d, which removes d from test_change.
         ALTER TABLE test_change REPLACE COLUMNS (a INT, b INT, c STRING, e STRING) CASCADE;
       将d和e两列一起删除:
          -- Redeclare only a, b and c, which removes both d and e from test_change.
          ALTER TABLE test_change REPLACE COLUMNS (a INT, b INT, c STRING) CASCADE;
    -- parquet格式的数据:数据文件中仍保留d、e字段的数据,但通过REPLACE删除列后无法再查询d、e
    -- REPLACE也可以调整字段的顺序,原始数据不用变也可正常查询

    parquet存储格式

    -- Rebuild the Parquet demo table from scratch: an external table with two
    -- string columns, partitioned by inc_day, stored as Parquet with Snappy.
    DROP TABLE IF EXISTS tmp_dm_test_a.t_aa;
    USE tmp_dm_test_a;
    CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa (
        user_id  STRING COMMENT '用户id',
        all_addr STRING COMMENT '常用地址'
    )
    PARTITIONED BY (inc_day STRING COMMENT 'inc_day used by partition')
    STORED AS PARQUET
    TBLPROPERTIES ('parquet.compression' = 'SNAPPY');
    
    
    
    -- Dynamic partitioning in non-strict mode, as used by the
    -- INSERT ... PARTITION (inc_day) statements in this script.
    SET hive.exec.dynamic.partition=true;
    SET hive.exec.dynamic.partition.mode=nonstrict;

    -- Fetch-task conversion and parallel execution settings.
    SET hive.fetch.task.conversion=more;
    SET hive.exec.parallel=true;

    -- MapReduce file output: Snappy codec, BLOCK compression type.
    SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
    SET mapreduce.output.fileoutputformat.compress.type=BLOCK;
    
    -- Seed partitions 20180101 and 20180102 with three rows each.
    -- The partition value is aliased as inc_day and the final SELECT names
    -- its columns explicitly (dynamic-partition column last), so the mapping
    -- onto the target table no longer relies on the column order of SELECT *.
    WITH tmp AS (
        SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, '20180101' AS inc_day
        UNION ALL
        SELECT 'sf2222' AS user_id, '江西省' AS all_addr, '20180101' AS inc_day
        UNION ALL
        SELECT 'sf3333' AS user_id, '上东省' AS all_addr, '20180101' AS inc_day
        UNION ALL
        SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, '20180102' AS inc_day
        UNION ALL
        SELECT 'sf2222' AS user_id, '江西省' AS all_addr, '20180102' AS inc_day
        UNION ALL
        SELECT 'sf3333' AS user_id, '上东省' AS all_addr, '20180102' AS inc_day
    )
    INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa PARTITION (inc_day)
    SELECT
        user_id,
        all_addr,
        inc_day
    FROM tmp;
    
    -- Inspection queries for the Parquet table. The ones that name
    -- original_union_id or phone_number only resolve once the corresponding
    -- ALTER TABLE statements in this script have been applied.
    SELECT *
    FROM tmp_dm_test_a.t_aa;

    SELECT
        user_id,
        all_addr,
        original_union_id
    FROM tmp_dm_test_a.t_aa;

    SELECT
        user_id,
        all_addr,
        original_union_id
    FROM tmp_dm_test_a.t_aa
    WHERE inc_day = '20180101';

    -- Same projection, using the renamed column phone_number.
    SELECT
        phone_number,
        all_addr,
        original_union_id
    FROM tmp_dm_test_a.t_aa;
    
    
     -- Table-level ADD COLUMNS, issued here without CASCADE.
     -- NOTE(review): the ORC example in this file appends CASCADE so the new
     -- column is visible in existing partitions; confirm the omission here is
     -- intentional for the experiment.
     ALTER TABLE tmp_dm_test_a.t_aa
         ADD COLUMNS (original_union_id STRING);

     -- Partition-level ADD COLUMNS, applied to inc_day = '20180101' only.
     ALTER TABLE tmp_dm_test_a.t_aa PARTITION (inc_day = '20180101')
         ADD COLUMNS (original_union_id STRING);
     
     
     
     
    -- Recreate the Parquet table with original_union_id declared up front.
    DROP TABLE IF EXISTS tmp_dm_test_a.t_aa;
    USE tmp_dm_test_a;
    CREATE EXTERNAL TABLE IF NOT EXISTS tmp_dm_test_a.t_aa (
        user_id           STRING COMMENT '用户id',
        all_addr          STRING COMMENT '常用地址',
        original_union_id STRING
    )
    PARTITIONED BY (inc_day STRING COMMENT 'inc_day used by partition')
    STORED AS PARQUET
    TBLPROPERTIES ('parquet.compression' = 'SNAPPY');
    
    
    -- Repair the partition metadata of the recreated table.
    -- NOTE(review): presumably this re-registers the partitions written before
    -- the table was dropped and recreated — confirm against the Hive docs.
    MSCK REPAIR TABLE tmp_dm_test_a.t_aa;
    
    -- Seed partitions 20180103 and 20180104 with rows that carry the new
    -- original_union_id column. The partition value is aliased as inc_day and
    -- the final SELECT names its columns explicitly (dynamic-partition column
    -- last), so the mapping no longer relies on the column order of SELECT *.
    WITH tmp AS (
        SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, 'sf0x1111' AS original_union_id, '20180103' AS inc_day
        UNION ALL
        SELECT 'sf2222' AS user_id, '江西省' AS all_addr, 'sf0x2211' AS original_union_id, '20180103' AS inc_day
        UNION ALL
        SELECT 'sf3333' AS user_id, '上东省' AS all_addr, 'sf0x3311' AS original_union_id, '20180103' AS inc_day
        UNION ALL
        SELECT 'sf1111' AS user_id, '湖南省' AS all_addr, 'sf0x4411' AS original_union_id, '20180104' AS inc_day
        UNION ALL
        SELECT 'sf2222' AS user_id, '江西省' AS all_addr, 'sf0x5511' AS original_union_id, '20180104' AS inc_day
        UNION ALL
        SELECT 'sf3333' AS user_id, '上东省' AS all_addr, 'sf0x6611' AS original_union_id, '20180104' AS inc_day
    )
    INSERT OVERWRITE TABLE tmp_dm_test_a.t_aa PARTITION (inc_day)
    SELECT
        user_id,
        all_addr,
        original_union_id,
        inc_day
    FROM tmp;
    
    
    ********************
    -- Rename user_id to phone_number (type stays string), written twice:
    -- first without CASCADE, then with CASCADE.
    -- NOTE(review): presumably these are alternatives to compare, since both
    -- statements rename the same source column user_id.
    ALTER TABLE tmp_dm_test_a.t_aa
        CHANGE COLUMN user_id phone_number STRING;

    ALTER TABLE tmp_dm_test_a.t_aa
        CHANGE COLUMN user_id phone_number STRING CASCADE;

    *********************************************

    结论:

    1、parquet和orc格式,旧分区中数据文件内容不可变。

    2、parquet和orc格式:字段增加后,旧数据文件中无新字段内容;新产生的分区中数据文件才会有新字段内容。

    3、parquet和orc格式:通过add语句末尾追加新增字段后,旧分区和新分区都可以查,旧数据为null而已。

    4、parquet格式:修改字段名后,无法从旧数据解析原字段内容,相当于新旧字段名没有映射关系。因为旧数据中只有旧字段名没有新字段名,而且新字段名无法映射到旧字段名,所以select不能解析新字段名。

    5、orc格式:修改字段名后,可以从旧数据解析原字段内容,相当于新旧字段名有映射关系。虽然旧数据中只有旧字段名没有新字段名,但是新字段名映射到了旧字段名,相当于一个别名,所以select可以解析新字段名。

  • 相关阅读:
    外设简述
    代C语言上机实践
    css动画效果
    css滑动门原理
    css整理
    html
    html单词
    倒计时.js
    随机方块
    求字符串出现次数和最大值
  • 原文地址:https://www.cnblogs.com/LIAOBO/p/13896458.html
Copyright © 2011-2022 走看看