zoukankan      html  css  js  c++  java
  • hive建库,建表,删表,删库,添加静态或动态分区

    -- First batch of databases. The first three live under the Hive warehouse
    -- directory and carry the creator property 'hadoop'.
    CREATE DATABASE IF NOT EXISTS bisys
        COMMENT 'bisys系统库'
        LOCATION '/user/hive/warehouse/bisys.db'
        WITH DBPROPERTIES ('creator'='hadoop', 'date'='2019-12-26');

    CREATE DATABASE IF NOT EXISTS bidwh
        COMMENT 'bidwh系统库'
        LOCATION '/user/hive/warehouse/bidwh.db'
        WITH DBPROPERTIES ('creator'='hadoop', 'date'='2019-06-21');

    CREATE DATABASE IF NOT EXISTS biolap
        COMMENT 'biolap系统库'
        LOCATION '/user/hive/warehouse/biolap.db'
        WITH DBPROPERTIES ('creator'='hadoop', 'date'='2019-06-24');

    -- biolap is declared a second time with a different location and creator.
    -- Because of IF NOT EXISTS, this statement does nothing once the biolap
    -- statement above has already created the database.
    CREATE DATABASE IF NOT EXISTS biolap
        COMMENT 'biolap系统库'
        LOCATION '/user/wdxx/db/biolap.db'
        WITH DBPROPERTIES ('creator'='wdxx', 'date'='2019-06-26');

    ----------------------new---------------------------------------------
    -- Second batch of databases, all located under /user/wdxx/db with the
    -- creator property 'wdxx'.
    CREATE DATABASE IF NOT EXISTS bibuf
        COMMENT 'bibuf系统库'
        LOCATION '/user/wdxx/db/bibuf.db'
        WITH DBPROPERTIES ('creator'='wdxx', 'date'='2020-04-26');

    CREATE DATABASE IF NOT EXISTS bicore
        COMMENT 'bicore系统库'
        LOCATION '/user/wdxx/db/bicore.db'
        WITH DBPROPERTIES ('creator'='wdxx', 'date'='2020-04-26');

    CREATE DATABASE IF NOT EXISTS biodb
        COMMENT 'biodb系统库'
        LOCATION '/user/wdxx/db/biodb.db'
        WITH DBPROPERTIES ('creator'='wdxx', 'date'='2020-04-26');

    -- bidwh and biolap are also created by earlier statements in this file;
    -- with IF NOT EXISTS these two statements do nothing if those databases
    -- already exist.
    CREATE DATABASE IF NOT EXISTS bidwh
        COMMENT 'bidwh系统库'
        LOCATION '/user/wdxx/db/bidwh.db'
        WITH DBPROPERTIES ('creator'='wdxx', 'date'='2020-04-26');

    CREATE DATABASE IF NOT EXISTS biolap
        COMMENT 'biolap系统库'
        LOCATION '/user/wdxx/db/biolap.db'
        WITH DBPROPERTIES ('creator'='wdxx', 'date'='2020-04-26');


    -- Creates EXCEL_ZB_LIST as a delimited SEQUENCEFILE table at an explicit
    -- HDFS location.
    --
    -- Changes from the original statement:
    --   * clsj was declared "DATE default sysdate not null". That is Oracle
    --     syntax and sysdate is not a Hive function, so the column is now a
    --     plain DATE; populate it with current_date when loading rows.
    --   * The LOCATION path is now a quoted string literal, as Hive requires.
    --   * The statement is terminated with a semicolon so it does not run into
    --     the next CREATE TABLE.
    -- NOTE(review): the field delimiter is a single space exactly as written;
    -- if the source data is tab-delimited this should be '\t' — confirm.
    CREATE TABLE IF NOT EXISTS EXCEL_ZB_LIST
    (
        ywymc     STRING,
        zywymc    STRING,
        ztmc      STRING,
        zbbm      STRING,
        zbmc      STRING,
        zbjc      STRING,
        tjkj      STRING,
        dw        STRING,
        jsgs      STRING,
        zbly      STRING,
        zbcc      STRING,
        tjpl      STRING,
        sflj      STRING,
        zbdy      STRING,
        wd        STRING,
        sfyszb    STRING,
        clsj      DATE,
        file_name STRING,
        hzbbm     STRING
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    STORED AS SEQUENCEFILE
    LOCATION '/user/hive/warehouse/bisys.db/EXCEL_ZB_LIST';


    -- Creates TB_HZB_WD as a delimited SEQUENCEFILE table at an explicit HDFS
    -- location. IF NOT EXISTS was added so the script can be re-run without
    -- failing, matching the EXCEL_ZB_LIST definition in this file.
    CREATE TABLE IF NOT EXISTS TB_HZB_WD
    (
        wdbid  STRING,
        wdcol  STRING,
        wdlm   STRING,
        wdtext STRING,
        zbbm   STRING,
        hzbbm  STRING
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ' '
    STORED AS SEQUENCEFILE
    LOCATION '/user/hive/warehouse/bisys.db/TB_HZB_WD';


    为了对表进行合理的管理以及提高查询效率,Hive可以将表组织成“分区”。一个分区实际上就是表下的一个目录,一个表可以在多个维度上进行分区,分区之间的关系就是目录树的关系
    1、创建分区表
    通过PARTITIONED BY子句指定,分区的顺序决定了谁是父目录,谁是子目录
    创建有一个分区的分区表:
    -- Text table with a single partition column, day_id; fields are separated by '|'.
    CREATE TABLE IF NOT EXISTS part_test (
        c1 STRING,
        c2 STRING,
        c3 STRING,
        c4 STRING
    )
    PARTITIONED BY (day_id STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
    STORED AS TEXTFILE;
    创建有两个分区的分区表:
    -- Text table with two partition columns; month_id is listed first, so it
    -- is the parent directory level and day_id the child level.
    CREATE TABLE IF NOT EXISTS part_test_1 (
        c1 STRING,
        c2 STRING,
        c3 STRING,
        c4 STRING
    )
    PARTITIONED BY (month_id STRING, day_id STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
    STORED AS TEXTFILE;


    2、 外部分区表
    外部表也可以建成分区表,如hdfs目录/user/tuoming/part下有
    201805和201806两个目录,201805下有一个20180509子目录,201806下有20180609和20180610两个子目录。
    创建一个映射到/user/tuoming/part目录的外部分区表:
    -- External, comma-delimited text table partitioned by month_id and day_id,
    -- rooted at /user/tuoming/part.
    -- Fix: the original read "STORED AS TEXTFILELOCATION ..."; the missing
    -- whitespace between TEXTFILE and LOCATION made the statement unparseable.
    CREATE EXTERNAL TABLE IF NOT EXISTS part_test_2 (
        c1 STRING,
        c2 STRING,
        c3 STRING,
        c4 STRING
    )
    PARTITIONED BY (month_id STRING, day_id STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    STORED AS TEXTFILE
    LOCATION '/user/tuoming/part';
    为part_test_2增加分区:
    -- Register each existing HDFS sub-directory as a partition of part_test_2.
    ALTER TABLE part_test_2
        ADD PARTITION (month_id='201805', day_id='20180509')
        LOCATION '/user/tuoming/part/201805/20180509';
    ALTER TABLE part_test_2
        ADD PARTITION (month_id='201806', day_id='20180609')
        LOCATION '/user/tuoming/part/201806/20180609';
    ALTER TABLE part_test_2
        ADD PARTITION (month_id='201806', day_id='20180610')
        LOCATION '/user/tuoming/part/201806/20180610';
    使用show partitions语句查看part_test_2有哪些分区:
    SHOW PARTITIONS part_test_2;


    3、 内部分区表
    创建一个主分区为month_id,子分区为day_id的内部分区表:
    -- Managed (non-external) text table partitioned by month_id, then day_id.
    CREATE TABLE IF NOT EXISTS part_test_3 (
        c1 STRING,
        c2 STRING,
        c3 STRING,
        c4 STRING
    )
    PARTITIONED BY (month_id STRING, day_id STRING)
    ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
    STORED AS TEXTFILE;
    为内部分区表加载数据
    (1)使用load data inpath…overwrite into table partition语句从hdfs目录加载:
    -- Load an HDFS path into one specific partition, replacing its contents.
    LOAD DATA INPATH '/user/tuoming/test/test'
        OVERWRITE INTO TABLE part_test_3
        PARTITION (month_id='201805', day_id='20180509');
    (2)使用insert overwrite table/ insert into…partition语句从查询结果中加载:
    覆盖插入:
    -- Replace the partition's contents with the query result.
    INSERT OVERWRITE TABLE part_test_3
        PARTITION (month_id='201805', day_id='20180509')
    SELECT * FROM part_test_temp;
    追加插入:
    -- Append the query result to the partition.
    INSERT INTO part_test_3
        PARTITION (month_id='201805', day_id='20180509')
    SELECT * FROM part_test_temp;
    注意:使用以上两种方法为内部分区表加载数据不需要预创建分区,加载数据时会自动创建相应的分区。如果想要为内部表预先创建分区,需要使用hadoop fs -mkdir命令在表目录下先创建相应的分区目录,然后再使用alter table add partition语句增加分区,例如:
    alter table part_test_3 add partition(month_id='201805',day_id='20180510');


    4、 删除分区
    使用alter table…drop partition语句删除对应分区:
    -- Only day_id is specified although part_test_3 is partitioned by
    -- (month_id, day_id), so the spec is not tied to a particular month_id.
    ALTER TABLE part_test_3
        DROP PARTITION (day_id='20180509');
    注意:外部分区表使用alter table…drop partition语句删除分区,只会删除元数据,相应的目录和文件并不会删除。内部表使用该语句删除分区,既会删除元数据,也会删除相应的目录和数据文件。

    5、 动态分区
    上述使用insert overwrite table…partition…从查询结果加载数据到分区,必须指定特定的分区,而且每个分区都需要使用一条插入语句。当需要一次插入多个分区的数据时,可以使用动态分区,根据查询得到的数据动态分配到分区里。动态分区与静态分区的区别就是不指定分区目录,由hive根据实际的数据选择插入到哪一个分区。
    -- Enable dynamic partitioning.
    set hive.exec.dynamic.partition=true;
    -- Allow every partition column to be dynamic.
    -- Fix: the value was misspelled "nonstrick"; the valid mode is "nonstrict".
    set hive.exec.dynamic.partition.mode=nonstrict;
    hive:默认允许动态分区个数为100,超出抛出异常:
    在执行插入数据到分区时,添加参数设置:
    -- Session settings to apply before inserting into dynamic partitions.
    -- The explanatory notes used to follow each statement's semicolon on the
    -- same line, which breaks the snippet when pasted; they are comments now.
    set hive.exec.dynamic.partition=true;
    set hive.exec.dynamic.partition.mode=nonstrict;
    -- Maximum number of dynamic partitions each node may create (default 100).
    set hive.exec.max.dynamic.partitions.pernode=10000;
    -- Maximum number of dynamic partitions one DML statement may create (default 1000).
    set hive.exec.max.dynamic.partitions=10000;
    -- Maximum number of files one DML statement may create (default 100000).
    -- NOTE(review): 10000 is lower than that default — confirm this is intended.
    set hive.exec.max.created.files=10000;
    -- month_id is a static partition; day_id is a dynamic partition.
    -- Fix: "kafka_offsetwhere" was missing the space before WHERE.
    insert overwrite table dynamic_test partition(month_id='201710',day_id)
    select c1,c2,c3,c4,c5,c6,c7,day_id
    from kafka_offset
    where substr(day_id,1,6)='201710';

    -- Both month_id and day_id are dynamic partitions; the last two selected
    -- columns supply their values.
    insert overwrite table dynamic_test partition(month_id,day_id)
    select c1,c2,c3,c4,c5,c6,c7,substr(day_id,1,6) as month_id,day_id
    from kafka_offset;

    为了让分区列的值相同的数据尽量在同一个mapreduce中,这样每一个mapreduce可以尽量少的产生新的文件夹,可以借助distribute by的功能,将分区列值相同的数据放到一起。
    -- Fix: "kafka_offsetdistribute" was missing the space before DISTRIBUTE BY.
    insert overwrite table dynamic_test partition(month_id,day_id)
    select c1,c2,c3,c4,c5,c6,c7,substr(day_id,1,6) as month_id,day_id
    from kafka_offset
    distribute by month_id,day_id;

    清表方法,如清除default库下面的所有表,如下方法
    # Drop every table in one Hive database, one `hive -e` call per table.
    # Fix: the table list was read from "default" but the drops ran against
    # "bisys". The database name now lives in a single variable so listing and
    # dropping always target the same database.
    db=default
    hive -e "use ${db};show tables" > tbname.txt
    while IFS= read -r line
    do
        # Skip blank lines so we never issue an empty "drop table".
        [ -z "$line" ] && continue
        # stdin is redirected so hive cannot consume the rest of tbname.txt.
        hive -e "use ${db};drop table $line" < /dev/null
    done < tbname.txt


    HIVE优化后的一键脚本删除库下面的表,不同库下面只需修改库名
    # Drop every table in one Hive database with a single `hive -e` call.
    # Fix: the table list was read from "biolap" but the drops ran against
    # "biodb". Change only `db` to point the script at another database.
    db=biolap
    hive -e "use ${db};show tables;" > tbname.txt
    # Start from an empty file so statements left over from an interrupted
    # earlier run are never replayed.
    : > droptables.txt
    while IFS= read -r line
    do
        # Skip blank lines so we never emit an empty "drop table ;".
        [ -z "$line" ] && continue
        printf 'drop table %s;' "$line" >> droptables.txt
    done < tbname.txt
    tables=$(cat droptables.txt)
    echo "$tables"
    hive -e "use ${db};$tables"
    rm -f droptables.txt tbname.txt

    动态分区要添加的参数

    -- Session settings used for dynamic-partition inserts.
    set hive.exec.dynamic.partition=true;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.max.dynamic.partitions.pernode=10000;
    set hive.exec.max.dynamic.partitions=10000;
    -- NOTE(review): Hive's default for this limit is 100000, so 10000 lowers
    -- it — confirm this is intended.
    set hive.exec.max.created.files=10000;
    set spark.executor.memory=12g;
    set spark.executor.cores=1;
    set spark.executor.instances=10;
    -- Fix: the property was written "spark.sql.shuffle.partition"; the Spark
    -- setting is named spark.sql.shuffle.partitions (plural).
    set spark.sql.shuffle.partitions=20000;

  • 相关阅读:
    asp.net留言板项目源代码下载
    HoverTree项目添加了查看留言列表功能
    HoverTree开源项目已经实现管理员登录
    HoverTree项目已经实现分层
    c# 连接Mysql数据库
    单行文字滚动就用myslider
    C#播放MP3源代码
    PHP 判断是否为 AJAX 请求
    c# TCP Socket通讯基础
    javascript类型注意事项
  • 原文地址:https://www.cnblogs.com/lianxuan1768/p/14484932.html
Copyright © 2011-2022 走看看