为什么
hive 窗口函数 http://wingyumin.com/2017/10/20/Hive%E5%AE%9E%E7%94%A8%E5%87%BD%E6%95%B0%E5%A4%A7%E5%85%A8/
hive普通函数 https://www.iteblog.com/archives/2258.html#i
参考文档:
《Apache Hive Cookbook》
# 示例表结构sales
CREATE TABLE `sales`(
`id` int,
`fname` string,
`state` string,
`zip` int,
`ip` string,
`pid` string)
# 示例数据sales
0 Zena Tennessee 21550 192.168.56.101 PI_09
1 Elaine Alaska 6429 192.168.56.101 PI_03
2 Sage Nevada 8899 192.168.56.102 PI_03
3 Cade Missouri 11233 192.168.56.103 PI_06
4 Abra New Jersey 21500 192.168.56.101 PI_09
5 Stone Nebraska 3560 192.168.56.104 PI_08
6 Regina Tennessee 21550 192.168.56.105 PI_10
7 Donova New York 95234 192.168.56.106 PI_05
8 Aileen Illinois 68284 192.168.56.106 PI_02
9 Maraam Hawaii 95234 192.168.56.107 PI_07
# 示例表结构po
CREATE TABLE `po`(
`cookieid` string,
`createtime` string,
`pv` int)
# 示例数据po
cookie1 2015-04-10 1
cookie1 2015-04-11 5
cookie1 2015-04-12 7
cookie1 2015-04-13 3
cookie1 2015-04-14 2
cookie1 2015-04-15 4
cookie1 2015-04-16 5
|
一、分析型函数
-
ROW_NUMBER
语法:- ROW_NUMBER() OVER (ORDER BY col)
按col排序后,为每条记录返回一个从1开始的连续序号。 - ROW_NUMBER() OVER (PARTITION BY col1 ORDER BY col2)
按照col1分组,在分组内对col2进行排序并返回顺序数字。
hive> select fname,pid,ip from sales;OKZena PI_09 192.168.56.101Elaine PI_03 192.168.56.101Sage PI_03 192.168.56.102Cade PI_06 192.168.56.103Abra PI_09 192.168.56.101Stone PI_08 192.168.56.104Regina PI_10 192.168.56.105Donova PI_05 192.168.56.106Aileen PI_02 192.168.56.106Maraam PI_07 192.168.56.107hive> select fname,pid,ip,row_number() over (order by ip) from sales;Abra PI_09 192.168.56.101 1Elaine PI_03 192.168.56.101 2Zena PI_09 192.168.56.101 3Sage PI_03 192.168.56.102 4Cade PI_06 192.168.56.103 5Stone PI_08 192.168.56.104 6Regina PI_10 192.168.56.105 7Aileen PI_02 192.168.56.106 8Donova PI_05 192.168.56.106 9Maraam PI_07 192.168.56.107 10hive> select fname,pid,ip,row_number() over (partition by pid order by ip) from sales;Aileen PI_02 192.168.56.106 1Elaine PI_03 192.168.56.101 1Sage PI_03 192.168.56.102 2Donova PI_05 192.168.56.106 1Cade PI_06 192.168.56.103 1Maraam PI_07 192.168.56.107 1Stone PI_08 192.168.56.104 1Abra PI_09 192.168.56.101 1Zena PI_09 192.168.56.101 2Regina PI_10 192.168.56.105 1 - ROW_NUMBER() OVER (ORDER BY col)
-
RANK
- RANK() OVER (ORDER BY col)
和ROW_NUMBER()差不多,但是排序时候相同的字段会返回相同的数字。 - RANK() OVER (PARTITION by col1 ORDER BY col2)
和ROW_NUMBER()差不多,但是排序时候同等级的字段会返回相同的数字。
hive> select fname,ip,rank() over (order by ip) from sales;Abra 192.168.56.101 1Elaine 192.168.56.101 1Zena 192.168.56.101 1Sage 192.168.56.102 4Cade 192.168.56.103 5Stone 192.168.56.104 6Regina 192.168.56.105 7Aileen 192.168.56.106 8Donova 192.168.56.106 8Maraam 192.168.56.107 10hive> select fname,pid,ip,rank() over (partition by pid order by ip) from sales;Aileen PI_02 192.168.56.106 1Elaine PI_03 192.168.56.101 1Sage PI_03 192.168.56.102 2Donova PI_05 192.168.56.106 1Cade PI_06 192.168.56.103 1Maraam PI_07 192.168.56.107 1Stone PI_08 192.168.56.104 1Abra PI_09 192.168.56.101 1Zena PI_09 192.168.56.101 1Regina PI_10 192.168.56.105 1 - RANK() OVER (ORDER BY col)
-
DENSE_RANK
和RANK()差不多,但是RANK()的排序数字存在空洞,见一-2-第一个示例。而DENSE_RANK()则不会存在排序数字空洞。hive> select fname,ip,dense_rank() over (order by ip) from sales;Abra 192.168.56.101 1Elaine 192.168.56.101 1Zena 192.168.56.101 1Sage 192.168.56.102 2Cade 192.168.56.103 3Stone 192.168.56.104 4Regina 192.168.56.105 5Aileen 192.168.56.106 6Donova 192.168.56.106 6Maraam 192.168.56.107 7 -
PERCENT_RANK
- PERCENT_RANK() OVER (ORDER BY col) (当前行的rank-1)/(总行数-1)
- PERCENT_RANK() OVER (PARTITION BY col1 ORDER BY col2) 按照col1分组后,(分组内当前行的rank-1)/(分组内的总行数-1)
hive> select id,ip,percent_rank() over (order by id)from sales;0 192.168.56.101 0.01 192.168.56.101 0.11111111111111112 192.168.56.102 0.22222222222222223 192.168.56.103 0.33333333333333334 192.168.56.101 0.44444444444444445 192.168.56.104 0.55555555555555566 192.168.56.105 0.66666666666666667 192.168.56.106 0.77777777777777788 192.168.56.106 0.88888888888888889 192.168.56.107 1.0hive> select id,ip,percent_rank() over (partition by ip order by id)from sales;0 192.168.56.101 0.01 192.168.56.101 0.54 192.168.56.101 1.02 192.168.56.102 0.03 192.168.56.103 0.05 192.168.56.104 0.06 192.168.56.105 0.07 192.168.56.106 0.08 192.168.56.106 1.09 192.168.56.107 0.0 -
CUME_DIST
- CUME_DIST() OVER (ORDER BY col) 小于等于col当前值的行数/总行数。
- CUME_DIST() OVER (PARTITION BY col1 ORDER BY col2) 按照col1分组后,分组内部小于等于col2当前值的行数/分组内总行数。
hive> select id,ip from sales;OK0 192.168.56.1011 192.168.56.1012 192.168.56.1023 192.168.56.1034 192.168.56.1015 192.168.56.1046 192.168.56.1057 192.168.56.1068 192.168.56.1069 192.168.56.107hive> select id,ip,cume_dist() over (order by id)from sales;0 192.168.56.101 0.11 192.168.56.101 0.22 192.168.56.102 0.33 192.168.56.103 0.44 192.168.56.101 0.55 192.168.56.104 0.66 192.168.56.105 0.77 192.168.56.106 0.88 192.168.56.106 0.99 192.168.56.107 1.0hive> select id,ip,cume_dist() over (partition by ip order by id)from sales;0 192.168.56.101 0.33333333333333331 192.168.56.101 0.66666666666666664 192.168.56.101 1.02 192.168.56.102 1.03 192.168.56.103 1.05 192.168.56.104 1.06 192.168.56.105 1.07 192.168.56.106 0.58 192.168.56.106 1.09 192.168.56.107 1.0 -
NTILE
用于将分组数据按照顺序切分成n片,并返回当前切片值。hive> select id,ip, ntile(2) over (partition by ip order by id)from sales;0 192.168.56.101 11 192.168.56.101 14 192.168.56.101 22 192.168.56.102 13 192.168.56.103 15 192.168.56.104 16 192.168.56.105 17 192.168.56.106 18 192.168.56.106 29 192.168.56.107 1
二、窗口型函数
- LEAD
- LEAD() OVER (PARTITION BY col1 ORDER BY col2) 按照col1分组后,返回结果集中的下一个col2
- LAG
- LAG() OVER (PARTITION BY col1 ORDER BY col2) 按照col1分组后,返回结果集中的上一个col2
- FIRST_VALUE
- FIRST_VALUE() OVER (PARTITION BY col1 ORDER BY col2) 按照col1分组后,返回结果集中的第一个col2
- LAST_VALUE
- LAST_VALUE() OVER (PARTITION BY col1 ORDER BY col2) 按照col1分组后,返回结果集中的最后一个col2
- OVER
- 聚集OVER
- COUNT
- MIN
- MAX
- AVG
- OVER WITH PARTITION BY
- OVER WITH PARTITION BY and ORDER BY
# 首先需要了解函数OVER(PARTITION BY col1 ORDER BY col2)
SELECT cookieid,
createtime,
pv,
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS pv1, -- 默认为从起点到当前行
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv2, --从起点到当前行,结果同pv1
SUM(pv) OVER(PARTITION BY cookieid) AS pv3, --分组内所有行
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS pv4, --当前行+往前3行
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN 3 PRECEDING AND 1 FOLLOWING) AS pv5, --当前行+往前3行+往后1行
SUM(pv) OVER(PARTITION BY cookieid ORDER BY createtime ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv6 ---当前行+往后所有行
FROM po;
cookie1 2015-04-16 5 27 27 27 14 14 5
cookie1 2015-04-15 4 22 22 27 16 21 9
cookie1 2015-04-14 2 18 18 27 17 21 11
cookie1 2015-04-13 3 16 16 27 16 18 14
cookie1 2015-04-12 7 13 13 27 13 16 21
cookie1 2015-04-11 5 6 6 27 6 13 26
cookie1 2015-04-10 1 1 1 27 1 6 27
# 其他聚集函数方式相同。
# LEAD
hive> select fname,ip,pid,lead(pid) over (partition by ip order by ip) from sales;
Abra 192.168.56.101 PI_09 PI_03 -- ip的分组中下一个pid的值为PI_03
Elaine 192.168.56.101 PI_03 PI_09
Zena 192.168.56.101 PI_09 NULL
Sage 192.168.56.102 PI_03 NULL
Cade 192.168.56.103 PI_06 NULL
Stone 192.168.56.104 PI_08 NULL
Regina 192.168.56.105 PI_10 NULL
Aileen 192.168.56.106 PI_02 PI_05
Donova 192.168.56.106 PI_05 NULL
Maraam 192.168.56.107 PI_07 NULL
# LAG
hive> select fname,ip,pid,lag(pid) over (partition by ip order by ip) from sales;
Abra 192.168.56.101 PI_09 NULL
Elaine 192.168.56.101 PI_03 PI_09 -- ip的分组中上一个pid的值为PI_09
Zena 192.168.56.101 PI_09 PI_03
Sage 192.168.56.102 PI_03 NULL
Cade 192.168.56.103 PI_06 NULL
Stone 192.168.56.104 PI_08 NULL
Regina 192.168.56.105 PI_10 NULL
Aileen 192.168.56.106 PI_02 NULL
Donova 192.168.56.106 PI_05 PI_02
Maraam 192.168.56.107 PI_07 NULL
# FIRST_VALUE
hive> select fname,ip,pid,first_value(pid) over (partition by ip order by ip) from sales;
Abra 192.168.56.101 PI_09 PI_09 -- ip的分组中第一个pid的值为PI_09
Elaine 192.168.56.101 PI_03 PI_09
Zena 192.168.56.101 PI_09 PI_09
Sage 192.168.56.102 PI_03 PI_03
Cade 192.168.56.103 PI_06 PI_06
Stone 192.168.56.104 PI_08 PI_08
Regina 192.168.56.105 PI_10 PI_10
Aileen 192.168.56.106 PI_02 PI_02 -- ip的分组中第一个pid的值为PI_02
Donova 192.168.56.106 PI_05 PI_02
Maraam 192.168.56.107 PI_07 PI_07
# LAST_VALUE
hive> select fname,ip,pid,last_value(pid) over (order by ip) from sales;
Abra 192.168.56.101 PI_09 PI_09 -- ip的分组中最后一个pid的值为PI_09
Elaine 192.168.56.101 PI_03 PI_09
Zena 192.168.56.101 PI_09 PI_09
Sage 192.168.56.102 PI_03 PI_03
Cade 192.168.56.103 PI_06 PI_06
Stone 192.168.56.104 PI_08 PI_08
Regina 192.168.56.105 PI_10 PI_10
Aileen 192.168.56.106 PI_02 PI_05
Donova 192.168.56.106 PI_05 PI_05 -- ip的分组中最后一个pid的值为PI_05
Maraam 192.168.56.107 PI_07 PI_07
|
二、数值型函数
-
abs(x)
返回x的绝对值。hive> select abs(-1);1hive> select abs(1);1hive> select abs(0);0 -
bin(x)
返回x的二进制格式。hive> select bin(123);1111011hive> select bin(2);10hive> select bin(3);11 -
ceil(x),ceiling(x)
返回x向上取整整数。hive> select ceil(10.01);11hive> select ceil(0.01);1hive> select ceil(-10.01);-10 -
conv(x,y,z)
进制转换,将x从y进制转换为z进制。# 将10从十进制转换为二进制hive> select conv(10,10,2);1010 -
floor(x)
返回x向下取整整数。hive> select floor(10.01);10hive> select floor(11.01);11hive> select floor(11.77);11 -
greatest(x,y,z…)
返回x,y,z….中数值最大的值。hive> select greatest(1,8,2,3,-10,-12);8 -
least(x,y,z,…)
返回x,y,z…中数值最小的值。hive> select least(1,8,2,3,-10,-12);-12 -
negative(x)
返回x的负值。hive> select negative(1);-1hive> select negative(-1);1 -
round(x)
x取整。hive> select round(10.12);10hive> select round(10.92);11hive> select round(10);10hive> select round(10.5);11 -
sign(x)
当x是正数时返回1,当x是负数时返回-1,当x是0时返回0。
三、类型转换型函数
-
binary(x)
将x以二进制的方式存储。 -
cast(x as T)
将x转换为T类型。hive> select cast(10 as STRING);10hive> select cast(10 as INT);10hive> select cast('A' as INT);NULL
四、日期函数
-
add_months(x,y)
在日期x的基础上加上y月。hive> select add_months('2017-10-09',1);2017-11-09hive> select add_months('2017-10-09',4);2018-02-09 -
current_date()
获取当前日期。hive> select current_date();OK2017-10-20 -
current_timestamp()
获取当前时间。hive> select current_timestamp();2017-10-20 18:59:52.19 -
date_add(x,y)
在日期x的基础上加上y天。hive> select date_add('2017-10-09',4);2017-10-13hive> select date_add('2017-12-30',4);2018-01-03 -
date_format(x,y)
将日期x的格式化为y形式的时间。
y的格式请点击参考。 -
date_sub(x,y)
在日期x的基础上减去y天。 -
datediff(x,y)
返回日期x,y之间的时间差(天数)。hive> select datediff('2017-10-09','2017-10-01');8 -
unix_timestamp()
获取当前时区的UNIX时间戳 -
from_unixtime(x)
返回时间戳x的直观日期。hive> select from_unixtime(unix_timestamp());2017-10-20 19:13:11 -
last_day(x)
返回日期x的月份的最后一天的日期。hive> select last_day('2017-02-02');2017-02-28 -
months_between(x,y)
返回x,y之间的月份差。hive> select months_between('2017-10-01','2017-02-02');OK7.96774194
四、字符型函数
-
concat(x,y,….)
连接x,y….字符串合并为一个字符串。hive> select concat('ABC','abc','zzz');ABCabczzz -
concat_ws(z,x,y,….)
使用分隔符z连接x,y….字符串。hive> select concat_ws('/','ABC','abc','zzz');ABC/abc/zzz -
find_in_set(‘x’,’y,z,…’)
检查x是否存在于y,z…中,存在则返回位置值,否则返回0。hive> select find_in_set('abc','a,b,c,ab,abc,bc');5hive> select find_in_set('abc','a,b,c,ab,bc');0 -
in_file(x,y)
检查字符串x是否为文件y的一行。 -
initcap(x)
将字符串x的首字母大写,然后将其他字母小写。hive> select initcap('hello');Hellohive> select initcap('hellO');Hellohive> select initcap('hELLO');Hellohive> select initcap('HeLLO');Hellohive> select initcap('HELLO');Hello -
instr(x,y)
返回y在x中的第一个位置值。hive> select instr('abc','c');3hive> select instr('abc','d');0 -
length(x)
返回x的字符个数。hive> select length('abc');3hive> select length('中国');2 -
lower(x),lcase(x)
返回x的小写字符串。hive> select lower('ABC');abc -
locate(x,y,z)
返回x在y的位置z之后第一次出现的位置。 -
lpad(x,y,z)
将x左侧用字符串z填充,使结果总长度为y;若x的长度超过y,则截断为y个字符。hive> select lpad('a',2,'b');bahive> select lpad('a',3,'b');bbahive> select lpad('aaaa',3,'b');aaa -
ltrim(x)
去除x左侧的空格。hive> select ltrim(' AA');AA -
repeat(x,y)
将x重复y次。hive> select repeat('a',5);aaaaa -
reverse(x)
将x逆向输出。hive> select reverse('SUV');VUS -
rpad(x,y,z)
将x右侧用字符串z填充,使结果总长度为y;若x的长度超过y,则截断为y个字符。 -
rtrim(x)
去除x右侧的空格。 -
split(x,y)
用y分割字符串x,y为正则表达式。 -
substr(x,y),substring(x,y)
返回x从位置y直到结尾的字符串。 -
substr(x,y,z),substring(x,y,z)
返回x从位置y开始的字符串,长度为z的子串。 -
trim(x)
去掉x两端的空格。
五、条件函数
-
case when
# 方式1case when a=b then b1 when a=c then c1 else a end as col# 方式2case a when b then b1 when c then c1 else a end as col -
coalesce(x,y,z)
返回参数x,y,z中第一个不为null的值;若所有参数均为null,则返回null。 -
if(x,y,z)
当条件x成立时返回y,当条件x不成立时返回z。 -
isnotnull(x)
当x is not null时返回true,当x is null 返回false。 -
isnull(x)
当x is null时返回true,当x is not null 返回false。 -
nvl(x,y)
当x is null时返回y,当x is not null 返回x。hive> select nvl(1,100);1hive> select nvl(null,100);100
六、UDAF函数
- avg()
返回平均值 - count()
返回总数 - max()
返回最大值 - min()
返回最小值 - sum()
返回总和 - variance()
返回方差