  • Advanced Hive

    Two cases in which Hive skips MapReduce entirely:

    1. where ds > ' '   //ds is a partition column, so partition pruning alone answers the query

    2. select * from table   //a bare select * with no predicates is served by a direct fetch of the files
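
    For example, both of the following should return without launching a MapReduce job (a sketch, assuming the partitioned sal table created in section 1 below):

    -- no predicates at all: direct fetch
    select * from sal;
    -- filter touches only the partition column: partition pruning alone
    select * from sal where city > ' ';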


    1. Creating a table

    CREATE TABLE sal(
      id INT,
      name STRING,
      salary INT
      )
    partitioned by (city string)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE;




    1.1 Altering the table and its columns

    # drop every column except id and name
    alter table sal replace columns (id int, name string); 
    # add a column
    alter table sal add columns (remark string);
    # rename a column (and optionally change its type)
    ALTER TABLE table_name   
          CHANGE  col_old_name col_new_name   
          column_type;
          
    ALTER TABLE sal CHANGE remark city string;
    # note: if sal is still partitioned by city, this rename collides with the partition column; use a different name in that case
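
    DESCRIBE confirms the resulting schema (a sketch; the exact output depends on which of the statements above were run):

    describe sal;
    # id      int
    # name    string
    # city    string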


    2. Loading data

    load data local inpath '/home/hadoop/in/mytable' overwrite into table sal;
    # note: if sal is still partitioned, LOAD requires an explicit partition, e.g. ... into table sal partition (city='none');
    1	zuansun	3000	none
    2	zuansu2	4000	none
    3	zuansu3	3000	none
    4	zuansu4	4000	none
    5	zuansu5	3000	none
    6	zuansu6	4000	none
    7	zuansu7	3000	none
    8	zuansu8	4000	none
    9	zuansu9	10000	none
    10	zuansu10	20000	none
    11	zuansu11	15000	none
    12	zuansu12	25000	none


    3. Nested queries

    from (select * from sal) e select e.id,e.name,e.salary  where e.salary>3000;
    # case when
    select id,name,
     case 
        when salary<10000 then 'low' 
        when salary>=10000 and salary<20000 then 'lower-middle' 
        when salary>=20000 and salary<50000 then 'rich'
        else 'alien' 
      end as salarylevel 
    from sal;
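
    Against the twelve sample rows loaded in section 2, the expected output is roughly:

    1	zuansun	low
    2	zuansu2	low
    ...
    9	zuansu9	lower-middle
    10	zuansu10	rich
    11	zuansu11	lower-middle
    12	zuansu12	rich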


    4. GROUP BY

    # remark was renamed to city in section 1.1, so group on city
    select city,sum(salary) from sal group by city;


    5. Dynamic partitions

    5.1 Create a staging table

    CREATE TABLE sal_tmp(
      id INT,
      name STRING,
      salary INT,
      city string
      )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE;


    5.2 Load data into the staging table

    load data local inpath '/home/hadoop/in/mytable' overwrite into table sal_tmp;


    5.3 Session configuration

    set hive.exec.dynamic.partition=true;               // enable dynamic partitioning
    set hive.exec.dynamic.partition.mode=nonstrict;     // allow every partition column to be dynamic
    set hive.exec.max.dynamic.partitions.pernode=50000;  
    set hive.exec.max.dynamic.partitions=50000;  
    set hive.exec.max.created.files=500000;  
    set mapred.reduce.tasks=20000;      // number of reduce tasks for the job
    set hive.merge.mapfiles=true;       // merge small files at the end of a map-only job




    5.4 Partition-related parameters:

    hive.exec.dynamic.partition (default false): set to true to enable dynamic partitions
    hive.exec.dynamic.partition.mode (default strict): dynamic partition mode (nonstrict lets every partition column be dynamic; strict requires at least one static partition)
    hive.exec.max.dynamic.partitions.pernode (default 100): maximum number of partitions each mapper/reducer node may create; exceeding it fails the job
    hive.exec.max.dynamic.partitions (default 1000): maximum number of partitions a single DML statement may create
    hive.exec.max.created.files (default 100000): maximum number of files the whole MapReduce job may create




    5.5 Insert with dynamic partitioning

    # the dynamic partition column (city) must come last in the select list
    insert into table sal partition (city) select * from sal_tmp;
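
    A quick sanity check afterwards (a sketch; the partition names depend on the distinct city values in sal_tmp):

    show partitions sal;
    select * from sal where city='none' limit 3;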


    6. JOIN operations

    # create the table
    create table a(id int,gender string)
    row format delimited fields terminated by '\t' stored as textfile;
    # load data
    load data local inpath '/home/hadoop/in/a' overwrite into table a;
    # inner join
    select sal.id,sal.name,sal.salary,sal.city,a.gender from sal join a on(sal.id=a.id);
    # left outer join
    select sal.id,sal.name,sal.salary,sal.city,a.gender from sal left outer join a on(sal.id=a.id);
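
    Right outer, full outer and left semi joins follow the same pattern (a sketch):

    # full outer join: keep unmatched rows from both sides
    select sal.id,sal.name,a.gender from sal full outer join a on(sal.id=a.id);
    # left semi join: rows of sal with a match in a (Hive's substitute for IN/EXISTS)
    select sal.id,sal.name from sal left semi join a on(sal.id=a.id);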


    7. Creating an index

    create index a_index on table a(id) AS  'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD ;     
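
    WITH DEFERRED REBUILD creates the index empty; it holds no data until it is rebuilt (note that Hive indexes were removed in Hive 3.0, so this applies only to old releases like the one used here):

    # populate (or refresh) the index
    alter index a_index on a rebuild;
    # list indexes defined on the table
    show index on a;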


    8. Buckets

    # staging table
    create table tb_tmp(id int,age int, name string ,timeflag bigint) row format delimited fields terminated by ',';
    # bucketed table with 4 buckets
    create table tb_stu(id int,age int, name string,timeflag bigint) clustered by (id) sorted by (age) into 4 buckets row format delimited fields terminated by ',';
    # load data into the staging table
    load data local inpath '/home/hadoop/in/tb_tmp' overwrite into table tb_tmp;
    
    
    1,20,zxm,20140330
    2,21,ljz,20140330
    3,19,cds,20140330
    4,18,mac,20140330
    5,22,android,20140330
    6,23,symbian,20140330
    7,25,wp,20140330
    8,20,cxd,20140330
    9,21,fvd,20140330
    10,19,cvb,20140330
    11,18,erd,20140330
    12,22,nid,20140330
    13,23,fvd,20140330
    14,19,cvb,20140330
    15,18,e33,20140330
    16,22,nid,20140330
    # make Hive honor the bucket definition on insert
    set hive.enforce.bucketing = true;
    # insert into the bucketed table
    insert into table tb_stu select * from tb_tmp;
    # sampling
    select * from tb_stu tablesample(bucket 1 out of 4 on id);
    Note: TABLESAMPLE(BUCKET x OUT OF y) is the sampling clause; it is roughly equivalent to:
    SELECT * FROM numbersflat WHERE number % y = x-1;
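
    Varying y widens or narrows the sample relative to the bucket count (a sketch):

    # half of the 4 buckets: picks buckets 1 and 3
    select * from tb_stu tablesample(bucket 1 out of 2 on id);
    # a quarter of a single bucket
    select * from tb_stu tablesample(bucket 1 out of 16 on id);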


    9. RCFile

    # table stored in RCFile (columnar) format
    create table tb_rc(id int,age int, name string ,timeflag bigint) row format delimited fields terminated by ',' stored as rcfile;
    # tb_tmp already exists from section 8, so insert straight from it
    insert into table tb_rc select * from tb_tmp;
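
    DESCRIBE FORMATTED shows the storage format actually in use (a sketch; look for the RCFile input/output format classes):

    describe formatted tb_rc;
    # InputFormat:  org.apache.hadoop.hive.ql.io.RCFileInputFormat
    # OutputFormat: org.apache.hadoop.hive.ql.io.RCFileOutputFormat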




    10. Custom delimiters with regular expressions (RegexSerDe)

    #cat /tmp/liuxiaowen/1.txt
     
    000377201207221125^^APPLE IPHONE 4S^^2
    132288201210331629^^THINKING IN JAVA^^1
    132288201210331629^^THIN ssss^^1111
    132288201210331629^^THdd dd ddJAVA^^10
    
    
    add jar /opt/app/hive-0.7.0-rc1/lib/hive-contrib-0.7.0.jar ;  
      
    create external table tt(times string,  
    product_name string,  
    sale_num string  
    ) ROW FORMAT  
    SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'  
    WITH SERDEPROPERTIES  
    ( 'input.regex' = '([^^]*)\\^\\^([^^]*)\\^\\^([^^]*)',  
    'output.format.string' = '%1$s %2$s %3$s')  
    STORED AS TEXTFILE;
    
    
    load data local inpath '/home/hadoop/in/tt' overwrite into table tt;
    
    
    hive> select product_name from tt;  
      
    APPLE IPHONE 4S  
    THINKING IN JAVA  
    THIN ssss  
    THdd dd ddJAVA 




    11. More complex data types

    11.1 Using array

    cat login_array.txt                                                                                                                    
    192.168.1.1,3105007010|3105007011|3105007012
    192.168.1.2,3105007020|3105007021|3105007022
    
    
    CREATE TABLE login_array (
      ip  STRING,
      uid  array<BIGINT>
    )
    PARTITIONED BY (dt STRING)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    STORED AS TEXTFILE;
    # load data into the Hive table
    LOAD DATA LOCAL INPATH '/home/hadoop/in/login_array' OVERWRITE INTO TABLE login_array PARTITION (dt='20130101'); 
    # inspect the data
    select * from login_array;
    192.168.1.1	[3105007010,3105007011,3105007012]	20130101
    192.168.1.2	[3105007020,3105007021,3105007022]	20130101
    select ip,uid[0] from login_array where dt='20130101'; -- index into the array
    192.168.1.1	3105007010
    192.168.1.2	3105007020
    select ip,size(uid) from login_array where dt='20130101'; # size() returns the array length
    192.168.1.1	3
    192.168.1.2	3
    select * from login_array where  array_contains(uid,3105007010); # search the array
    192.168.1.1	[3105007010,3105007011,3105007012]	20130101
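
    To flatten an array into one row per element, explode() with LATERAL VIEW is the usual companion (a sketch against the data above):

    select ip, u from login_array lateral view explode(uid) t as u where dt='20130101';
    192.168.1.1	3105007010
    192.168.1.1	3105007011
    192.168.1.1	3105007012
    192.168.1.2	3105007020
    ...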


    11.2 Using map

    cat map_test_raw:
    2014-03-03 12:22:34#127.0.0.1#get#amap#src=123&code=456&cookie=789#status=success&time=2s
    2014-03-03 11:22:34#127.0.0.1#get#autonavi#src=123&code=456#status=success&time=2s&cookie=789
    # create the table
    create external  table map_test_raw(ts String,ip String,type String,logtype String,request Map<String,String>,response Map<String,String>)
      ROW FORMAT DELIMITED FIELDS TERMINATED BY '#'
      COLLECTION ITEMS TERMINATED BY '&' 
      MAP KEYS TERMINATED BY '=' 
      stored as textfile;
    LOAD DATA LOCAL INPATH '/home/hadoop/in/map_test_raw' OVERWRITE INTO TABLE map_test_raw;
    # inspect the data
    select * from map_test_raw;
    2014-03-03 12:22:34	127.0.0.1	get	amap	{"src":"123","code":"456","cookie":"789"}	{"status":"success","time":"2s"}
    2014-03-03 11:22:34	127.0.0.1	get	autonavi	{"src":"123","code":"456"}	{"status":"success","time":"2s","cookie":"789"}
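
    Map values are read by key; keys missing from a row come back as NULL (a sketch against the rows above):

    select ts, request['src'], response['time'], response['cookie'] from map_test_raw;
    2014-03-03 12:22:34	123	2s	NULL
    2014-03-03 11:22:34	123	2s	789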


    11.3 Using struct

    # cat login_struct.txt
    192.168.1.1,zhangsan:40
    192.168.1.1,lisi:41
    192.168.1.1,gavin:42
    192.168.1.1,wangwu:43
    192.168.1.1,xiaoming:44
    192.168.1.1,xiaojun:45
    # create the table
    CREATE TABLE login_struct (
      ip  STRING,
      user  struct<name:string,age:int>
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ':'
    STORED AS TEXTFILE;
    # load the data
    LOAD DATA LOCAL INPATH '/home/hadoop/in/login_struct' OVERWRITE INTO TABLE login_struct; 
    # inspect the data
    select ip,user from login_struct;
    
    
    192.168.1.1	{"name":"zhangsan","age":40}
    192.168.1.1	{"name":"lisi","age":41}
    192.168.1.1	{"name":"gavin","age":42}
    192.168.1.1	{"name":"wangwu","age":43}
    192.168.1.1	{"name":"xiaoming","age":44}
    192.168.1.1	{"name":"xiaojun","age":45}
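
    Struct fields are read with dot notation (a sketch against the rows above):

    select ip, user.name, user.age from login_struct where user.age > 43;
    192.168.1.1	xiaoming	44
    192.168.1.1	xiaojun	45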














