  • Hive Advanced Topics

    Two cases in which Hive does not run a MapReduce job:

    1. where ds > ' '  //ds is a partition column, so the filter is answered from partition metadata

    2. select * from table  //no predicate or projection at all, so Hive simply fetches the underlying files
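
    For example, against the sal table created in section 1 below (partitioned by city), both statements should run as a plain fetch task rather than a MapReduce job; a minimal sketch, assuming a partition such as city='none' has been loaded:

    select * from sal where city > ' ';  //filter only on the partition column
    select * from sal;                   //no condition at all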


    1. Creating a table

    CREATE TABLE sal(
      id INT,
      name STRING,
      salary INT
      )
    partitioned by (city string)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE;




    1.1 Altering the table and its properties

    #drop every column except id and name
    alter table sal replace columns (id int, name string); 
    #add a column
    alter table sal add columns (remark string);
    #rename or retype a column
    ALTER TABLE table_name
          CHANGE col_old_name col_new_name
          column_type;

    ALTER TABLE sal CHANGE remark city string;
    #note: on the partitioned sal above this rename would collide with the partition column city; it is shown only to illustrate the syntax
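
    To verify the schema after each ALTER, describe can be used; a quick check against the table from section 1:

    describe sal;
    #expected to list the current columns, then the partition column, e.g. id, name, ..., city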


    2. Loading data

    #sal is partitioned, so the load needs a partition spec; extra trailing fields in the file are ignored
    load data local inpath '/home/hadoop/in/mytable' overwrite into table sal partition (city='none');
    1	zuansun	3000	none
    2	zuansu2	4000	none
    3	zuansu3	3000	none
    4	zuansu4	4000	none
    5	zuansu5	3000	none
    6	zuansu6	4000	none
    7	zuansu7	3000	none
    8	zuansu8	4000	none
    9	zuansu9	10000	none
    10	zuansu10	20000	none
    11	zuansu11	15000	none
    12	zuansu12	25000	none


    3. Nested queries

    from (select * from sal) e select e.id,e.name,e.salary  where e.salary>3000;
    #case when
    select id,name,
     case 
        when salary<10000 then 'low' 
        when salary>=10000 and salary<20000 then 'middle' 
        when salary>=20000 and salary<50000 then 'rich'
        else 'alien' 
      end as salarylevel 
    from sal;


    4. group by

    #assumes the remark column added in 1.1 (before the rename to city)
    select remark,sum(salary) from sal group by remark;
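
    GROUP BY is often paired with HAVING to filter on the aggregate; a sketch against the same columns:

    select remark,sum(salary) as total from sal group by remark having sum(salary)>10000;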


    5. Dynamic partitions

    5.1 Create a staging table

    CREATE TABLE sal_tmp(
      id INT,
      name STRING,
      salary INT,
      city string
      )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    STORED AS TEXTFILE;


    5.2 Load the data into the staging table

    load data local inpath '/home/hadoop/in/mytable' overwrite into table sal_tmp;


    5.3 Settings for the insert

    set hive.exec.dynamic.partition=true; //enable dynamic partitions
    set hive.exec.dynamic.partition.mode=nonstrict; //no static partition column required
    set hive.exec.max.dynamic.partitions.pernode=50000; //per-node limit on created partitions
    set hive.exec.max.dynamic.partitions=50000; //per-statement limit on created partitions
    set hive.exec.max.created.files=500000; //limit on files the whole job may create
    set mapred.reduce.tasks=20000; //default number of reduce tasks per job
    set hive.merge.mapfiles=true; //merge small files when a map-only job finishes




    5.4 Related partition parameters:

    hive.exec.dynamic.partition (default false): set to true to enable dynamic partitions
    hive.exec.dynamic.partition.mode (default strict): dynamic partition mode (nonstrict lets every partition column be dynamic; strict requires at least one static partition column)
    hive.exec.max.dynamic.partitions.pernode (default 100): maximum number of partitions each mapper/reducer node may create; exceeding it fails the job
    hive.exec.max.dynamic.partitions (default 1000): maximum total number of partitions one DML statement may create
    hive.exec.max.created.files (default 100000): maximum number of files the whole MapReduce job may create




    5.5 Insert with dynamic partitions
    insert into table sal partition (city) select * from sal_tmp;
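
    The select * works because city is the last column of sal_tmp and Hive binds the trailing select columns to the dynamic partition columns by position; a more explicit, order-safe sketch of the same insert:

    insert into table sal partition (city)
    select id,name,salary,city from sal_tmp; //partition column(s) must come last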


    6. Joins

    #create the table
    create table a(id int,gender string)
    row format delimited fields terminated by '\t' stored as textfile;
    #load the data
    load data local inpath '/home/hadoop/in/a' overwrite into table a;
    #inner join
    select sal.id,sal.name,sal.salary,sal.city,a.gender from sal join a on(sal.id=a.id);
    #left outer join
    select sal.id,sal.name,sal.salary,sal.city,a.gender from sal left outer join a on(sal.id=a.id);
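
    When one side of the join is small (like the lookup table a here), a map-side join avoids the shuffle phase; a sketch using the classic hint syntax (newer Hive versions can also convert joins automatically via hive.auto.convert.join):

    select /*+ MAPJOIN(a) */ sal.id,sal.name,a.gender from sal join a on(sal.id=a.id);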


    7. Creating an index

    create index a_index on table a(id) AS  'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler' WITH DEFERRED REBUILD ;     
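
    Because the index is declared WITH DEFERRED REBUILD, it stays empty until it is explicitly built; the standard follow-up statement:

    alter index a_index on a rebuild;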


    8. Buckets

    #staging table
    create table tb_tmp(id int,age int, name string ,timeflag bigint) row format delimited fields terminated by ',';
    #bucketed table with 4 buckets
    create table tb_stu(id int,age int, name string,timeflag bigint) clustered by (id) sorted by (age) into 4 buckets row format delimited fields terminated by ',';
    #load data into the staging table
    load data local inpath '/home/hadoop/in/tb_tmp' overwrite into table tb_tmp;
    
    
    1,20,zxm,20140330
    2,21,ljz,20140330
    3,19,cds,20140330
    4,18,mac,20140330
    5,22,android,20140330
    6,23,symbian,20140330
    7,25,wp,20140330
    8,20,cxd,20140330
    9,21,fvd,20140330
    10,19,cvb,20140330
    11,18,erd,20140330
    12,22,nid,20140330
    13,23,fvd,20140330
    14,19,cvb,20140330
    15,18,e33,20140330
    16,22,nid,20140330
    #make inserts honor the bucketing definition
    set hive.enforce.bucketing = true;
    #insert into tb_stu
    insert into table tb_stu select * from tb_tmp;
    #sampling
    select * from tb_stu tablesample(bucket 1 out of 4 on id);
    Note: tablesample is the sampling clause; its syntax is TABLESAMPLE(BUCKET x OUT OF y), which is roughly equivalent to:
    SELECT * FROM numbersflat WHERE number % y = x-1;
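
    y sets the fraction sampled (one bucket in every y) and x picks the starting bucket; a sketch of sampling half of the four-bucket table above:

    select * from tb_stu tablesample(bucket 1 out of 2 on id); //reads buckets 1 and 3, i.e. about half the rows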


    9. RCFile

    #table stored in RCFile format
    create table tb_rc(id int,age int, name string ,timeflag bigint) row format delimited fields terminated by ',' stored as rcfile;
    #tb_tmp already exists from the section above, so we can insert from it directly
    insert into table tb_rc select * from tb_tmp;
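
    To confirm the storage format actually applied, describe formatted shows the InputFormat/OutputFormat classes; a quick check:

    describe formatted tb_rc; //InputFormat should be org.apache.hadoop.hive.ql.io.RCFileInputFormat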




    10. Multi-character delimiters (using regular expressions)

    #cat /tmp/liuxiaowen/1.txt
     
    000377201207221125^^APPLE IPHONE 4S^^2
    132288201210331629^^THINKING IN JAVA^^1
    132288201210331629^^THIN ssss^^1111
    132288201210331629^^THdd dd ddJAVA^^10
    
    
    add jar /opt/app/hive-0.7.0-rc1/lib/hive-contrib-0.7.0.jar ;  
      
    create external table tt(times string,  
    product_name string,  
    sale_num string  
    ) ROW FORMAT  
    SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'  
    WITH SERDEPROPERTIES  
    ( 'input.regex' = '([^^]*)\\^\\^([^^]*)\\^\\^([^^]*)',  
    'output.format.string' = '%1$s %2$s %3$s')  
    STORED AS TEXTFILE;
    
    
    load data local inpath '/home/hadoop/in/tt' overwrite into table tt;
    
    
    hive> select product_name from tt;  
      
    APPLE IPHONE 4S  
    THINKING IN JAVA  
    THIN ssss  
    THdd dd ddJAVA 
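
    As an aside, newer Hive releases ship a MultiDelimitSerDe in hive-contrib that handles multi-character delimiters without writing a regex; a sketch, assuming a Hive version that includes it (0.14+; the 0.7 contrib jar added above predates it):

    create external table tt2(times string, product_name string, sale_num string)
    row format serde 'org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe'
    with serdeproperties ('field.delim'='^^')
    stored as textfile;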




    11. More complex data types

    11.1 array

    cat login_array.txt                                                                                                                    
    192.168.1.1,3105007010|3105007011|3105007012
    192.168.1.2,3105007020|3105007021|3105007022
    
    
    CREATE TABLE login_array (
      ip  STRING,
      uid  array<BIGINT>
    )
    PARTITIONED BY (dt STRING)
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY '|'
    STORED AS TEXTFILE;
    Load the data into the Hive table:
    LOAD DATA LOCAL INPATH '/home/hadoop/in/login_array' OVERWRITE INTO TABLE login_array PARTITION (dt='20130101'); 
    #view the data
    select * from login_array;
    192.168.1.1	[3105007010,3105007011,3105007012]	20130101
    192.168.1.2	[3105007020,3105007021,3105007022]	20130101
    select ip,uid[0] from login_array where dt='20130101'; --access array elements by index
    192.168.1.1	3105007010
    192.168.1.2	3105007020
    select ip,size(uid) from login_array where dt='20130101'; --get the array length
    192.168.1.1	3
    192.168.1.2	3
    select * from login_array where array_contains(uid,3105007010); --search within the array
    192.168.1.1	[3105007010,3105007011,3105007012]	20130101
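
    To flatten the array into one row per element, LATERAL VIEW with explode() is the usual tool; a sketch against the same table:

    select ip,uid_elem from login_array lateral view explode(uid) t as uid_elem where dt='20130101';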


    11.2 Using map

    cat map_test_raw:
    2014-03-03 12:22:34#127.0.0.1#get#amap#src=123&code=456&cookie=789#status=success&time=2s
    2014-03-03 11:22:34#127.0.0.1#get#autonavi#src=123&code=456#status=success&time=2s&cookie=789
    #create the table
    create external  table map_test_raw(ts String,ip String,type String,logtype String,request Map<String,String>,response Map<String,String>)
      ROW FORMAT DELIMITED FIELDS TERMINATED BY '#'
      COLLECTION ITEMS TERMINATED BY '&' 
      MAP KEYS TERMINATED BY '=' 
      stored as textfile;
    LOAD DATA LOCAL INPATH '/home/hadoop/in/map_test_raw' OVERWRITE INTO TABLE map_test_raw;
    #view the data
    select * from map_test_raw;
    2014-03-03 12:22:34	127.0.0.1	get	amap	{"src":"123","code":"456","cookie":"789"}	{"status":"success","time":"2s"}
    2014-03-03 11:22:34	127.0.0.1	get	autonavi	{"src":"123","code":"456"}	{"status":"success","time":"2s","cookie":"789"}
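
    Individual map values are read with bracket syntax on the key; a small sketch over the table above:

    select ts,request['src'],response['status'] from map_test_raw; --map_keys(), map_values() and size() also work on map columns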


    11.3 Using struct

    # cat login_struct.txt
    192.168.1.1,zhangsan:40
    192.168.1.1,lisi:41
    192.168.1.1,gavin:42
    192.168.1.1,wangwu:43
    192.168.1.1,xiaoming:44
    192.168.1.1,xiaojun:45
    # create the table
    CREATE TABLE login_struct (
      ip  STRING,
      user  struct<name:string,age:int>
    )
    ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
    COLLECTION ITEMS TERMINATED BY ':'
    STORED AS TEXTFILE;
    #load the data
    LOAD DATA LOCAL INPATH '/home/hadoop/in/login_struct' OVERWRITE INTO TABLE login_struct; 
    #view the data
    select ip,user from login_struct;
    
    
    192.168.1.1	{"name":"zhangsan","age":40}
    192.168.1.1	{"name":"lisi","age":41}
    192.168.1.1	{"name":"gavin","age":42}
    192.168.1.1	{"name":"wangwu","age":43}
    192.168.1.1	{"name":"xiaoming","age":44}
    192.168.1.1	{"name":"xiaojun","age":45}
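
    Struct fields are addressed with dot notation; a sketch over the table above (note that user is a reserved word in newer Hive versions and may need backticks there):

    select ip,user.name,user.age from login_struct where user.age>42;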














