时间函数
当前日期
select current_date(); 2018-11-14
当前时间戳
select current_timestamp(); 2018-11-14 21:35:16.237
date_format()
select date_format(current_date(),'yyyyMMdd'); 20181114 select date_format(current_timestamp(),'yyyyMMdd'); 20181114
unix_timestamp()
select unix_timestamp(); +-----------------------------------------------------------+--+ | unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss) | +-----------------------------------------------------------+--+ | 1542202845 | +-----------------------------------------------------------+--+
from_unixtime()
select from_unixtime(unix_timestamp(),'yyyyMMdd HH:mm:ss'); +--------------------------------------------------------------------------------------------- | from_unixtime(unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss), yyyyMMdd HH:mm:ss) +--------------------------------------------------------------------------------------------- | 20181114 21:43:14
日期差值
0: jdbc:hive2://s101:10000/lx> select datediff('2019-08-02','2019-08-06'); +---------------------------------------------------------------+--+ | datediff(CAST(2019-08-02 AS DATE), CAST(2019-08-06 AS DATE)) | +---------------------------------------------------------------+--+ | -4 | +---------------------------------------------------------------+--+
字符串函数
split
select explode(split('hello',''));
substr
select substr('hello',1,3);
trim去除前后空格
select trim(' hello ');
format_number
select format_number(1234.345,1);
concat
length
条件语句
if
select if(w1 >2 ,w1,w2) from www;
true则w1,false则w2
case when then
select case when w1 > 2 then w1 when w1 <= 2 then w2 end from www;
窗口函数
lead
lead(input[, offset[, default]]) 上提 input: 上提列 offset: 上提行数,可选,默认是 1 行 default:填充值,可选,默认是null 使用:select id,name,lead(id,2,'qq')over(partition by id order by id) lad from www; 注意:2可选,不写默认1 ‘qq’可选,不写默认null,类型需要对应,此处id是int,所以还是为null partition by id:可选,分组后对每个组进行lead order by id:必须写
lag
lag(input[, offset[, default]]) 下拉 input: 下拉列 offset: 下拉行数,可选,默认是 1 行 default:填充值,可选,默认是null 使用:select id,name,lag(id,2,11)over(partition by id order by id) lag from www; 注意:2可选,不写默认1 11可选,不写默认null partition by id:可选,分组后对每个组进行lag order by id:必须写
first_value
first_value(expr[, isIgnoreNull]) expr:列名或一个表达式 isIgnoreNull:true或false,如果是true将跳过null值,可选,默认false select id,name,first_value(concat(cast(id as string), name),true)over(partition by name order by id) lag from www;
last_value
select id,name,last_value(concat(cast(id as string), name),true)over(order by id ) lag from www; +-----+-------+--------+--+ | id | name | lag | +-----+-------+--------+--+ | 1 | a | 1a | | 2 | b | 2b | | 3 | c | 3c | | 4 | c | 4c | | 5 | c | 5c | | 6 | d | 6d | | 7 | b | 7b | | 8 | a | 8a | | 9 | a | 9a | | 12 | eee | 12eee | +-----+-------+--------+--+select id,name from www; +-----+-------+--+ | id | name | +-----+-------+--+ | 12 | eee | | 1 | a | | 2 | b | | 3 | c | | 4 | c | | 5 | c | | 6 | d | | 7 | b | | 8 | a | | 9 | a | +-----+-------+--+ select id,name,last_value(concat(cast(id as string), name),true)over() lag from www; +-----+-------+------+--+ | id | name | lag | +-----+-------+------+--+ | 12 | eee | 6d | | 7 | b | 6d | | 8 | a | 6d | | 9 | a | 6d | | 1 | a | 6d | | 2 | b | 6d | | 3 | c | 6d | | 4 | c | 6d | | 5 | c | 6d | | 6 | d | 6d | +-----+-------+------+--+
over和标准聚合函数
select distinct name,count(name)over(partition by name) s from www; 求分区个数并去重 select distinct name,sum(id)over(partition by name) s from www; 分区id和并去重 select id,name,max(length(name))over() from www; 总体的最大长度 select name,min(id)over(partition by name) s from www; 每个分区最小id select name,max(id)over(partition by name) s from www; 每个分区最大id select name,avg(id)over(partition by name) s from www; 每个分区平均id
over和partiton by
over和partition by order by
select first_value(id)over(partition by id,name) from www; select first_value(id)over(partition by id,name order by id,name) from www;
以行限定窗口范围
select * ,sum(grade)over(order by grade desc rows between current row and 1 following) from sg;
select * ,sum(grade)over(order by grade desc rows between current row and unbounded following) from sg;
select * ,sum(grade)over(order by grade desc rows between unbounded preceding and current row) from sg;
select * ,sum(grade)over(order by grade desc rows between unbounded preceding and unbounded following) from sg;
以值限定窗口范围
select * ,sum(grade)over(order by grade desc range between unbounded preceding and current row) from sg;
排名函数
rank():并列跳跃
dense_rank():并列连续,不跳跃
row_number():连续
Assigns a unique,sequential number to each row,
starting with one,
according to the ordering of rows within the window partition
//商家内用户访问次数倒序排列,取前三个
select * from ( select * ,row_number()over(partition by id order by count desc)b from ( select id,uu,count(*) as count from shangjia where uu is not null group by id,uu)a)c where b <= 3;
cume_dist()
select cume_dist()over(order by mid ) from t1; //小于等于当前值的行数/分组内总行数 select cume_dist()over(order by mid desc) from t1; //大于等于当前值的行数/分组内总行数
percent_rank
ntile(n):每个分区按一定顺序分成n份
高级聚合函数
grouping sets
select *,count(1),grouping_id() from pv group by t1,t2,s3 grouping sets(t1,t2,s3);
grouping__id //分组的组号(可选) grouping sets(t1,t2,s3,()) //相当于分别对t1,t2,s3,null进行分组并用union all连接
group by ... with cube
select *,count(1),grouping_id() from pv group by t1,t2,s3 with cube order by grouping_id();
八种
rollup
select *,count(1),grouping_id() from pv group by t1,t2,s3 with rollup order by grouping_id();
null、t1、t1和t2、t1和t2和t3
排序函数
order by age
全排序,一个reduce;需要加limit,在map阶段在每个分区中取出前n个元素,交给r处理
sort by age
部分排序
distribute by age
哈希分区
cluster by
distribute by + sort by
select age from user_order distribute by age sort by age;