zoukankan      html  css  js  c++  java
  • Hive常用函数 傻瓜学习笔记 附完整示例

    • 创建表

    -- Recreate the demo table from scratch so the script can be re-run.
    drop table if exists mydatabase.test;
    -- Fields are tab-delimited. The name and timestring sample values contain
    -- spaces, and no escape character is declared, so a space delimiter would
    -- split those values into extra columns when the rows are read back.
    -- A test.txt loaded into this table must therefore be tab-separated too.
    create table mydatabase.test
        (id int, name string, timestring string, salary double, bonus double)
        row format delimited
        fields terminated by '\t'
        stored as textfile;

    • 插入数据

    方式1
    vim test.txt
    hadoop fs -mkdir test
    hadoop fs -put test.txt /user/myname/test
    -- Load the files under the given HDFS path into the table.
    -- The overwrite keyword replaces whatever rows the table already holds.
    load data inpath '/user/myname/test'
        overwrite into table mydatabase.test;
    方式2
    -- Sample rows for the examples: three distinct names with several timestamps each.
    -- Values are positional (id, name, timestring, salary, bonus).
    -- bonus is NULL for ids 3, 11 and 12, which the NULL-handling examples rely on.
    insert into mydatabase.test values
        (1,' J ','2018-01-08 10:11:32',128.54,-45.23),
        (2,' J ','2018-02-09 10:51:12',128.54,-78.25),
        (3,' J ','2018-03-05 11:22:21',128.52,null),
        (4,' J ','2018-04-08 15:40:51',256.23,345.23),
        (5,' J ','2018-05-08 10:21:21',128.54,267.12),
        (6,' J ','2018-06-08 10:00:50',256.27,-78.49),
        (7,'Rose','2018-01-08 10:11:32',512.65,-76.44),
        (8,'Rose','2018-02-09 10:51:12',512.54,-45.30),
        (9,'Rose','2018-03-05 11:22:21',512.13,-87.09),
        (10,'Rose','2018-04-08 15:40:51',512.34,19.12),
        (11,'Dickson','2018-01-08 10:21:21',256.87,null),
        (12,'Dickson','2018-02-08 10:00:50',256.52,null),
        (13,'Dickson','2018-04-08 11:00:00',256.12,3.69);

    • 数学函数

    四舍五入
    -- Round salary to a whole number.
    select
        id,
        round(salary)
    from mydatabase.test;
    四舍五入,小数保留
    -- Round salary, keeping one decimal place.
    select
        id,
        round(salary, 1)
    from mydatabase.test;
    向下,向上取整
    -- Round salary down and up to the neighbouring whole numbers.
    select
        id,
        floor(salary),
        ceil(salary)
    from mydatabase.test;
    随机数(0~1)
    -- Scale salary by a random factor built from rand(), i.e. 1 plus up to 10 percent.
    select
        id,
        salary * (1 + rand() * 0.1)
    from mydatabase.test;
    指数,对数,取模
    -- Exponential (base e), logarithm (base e) and positive modulo examples.
    select
        id,
        pow(e(), salary),
        log(e(), salary),
        pmod(id, 3)
    from mydatabase.test;
    绝对值,最大值,最小值
    -- Absolute value of bonus, plus the larger and smaller of salary and bonus.
    select
        id,
        abs(bonus),
        greatest(salary, bonus),
        least(salary, bonus)
    from mydatabase.test;

    • 类型转换函数

    -- Convert the double salary column to int.
    select
        id,
        cast(salary as int)
    from mydatabase.test;

    • 日期函数

    当前时间
    -- Current time as a Unix timestamp, emitted alongside every row.
    select
        id,
        name,
        unix_timestamp()
    from mydatabase.test;
    时间戳转换为字符串
    -- Format the current Unix timestamp as a date-time string.
    -- HH is the 24-hour field. The lowercase hh is the 12-hour field, which
    -- would render 13:00 and 01:00 identically because no am/pm marker is printed.
    select
        id,
        name,
        from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss')
    from mydatabase.test;
    字符串转换为时间戳
    -- Parse a date-time string in the default layout into a Unix timestamp.
    select
        id,
        name,
        unix_timestamp('2019-02-13 11:22:33')
    from mydatabase.test;
    字符串转换为时间戳(指定格式)
    -- Parse a date-time string into a Unix timestamp using an explicit pattern.
    select
        id,
        name,
        unix_timestamp('20190213 11:22:33', 'yyyyMMdd HH:mm:ss')
    from mydatabase.test;
    时间子元素
    -- Break timestring into its date part and individual calendar and clock fields.
    select
        id,
        name,
        to_date(timestring),
        year(timestring),
        month(timestring),
        day(timestring),
        hour(timestring),
        minute(timestring),
        second(timestring)
    from mydatabase.test;

    • 条件函数

    IF条件
    -- Label each row 'yes' when bonus is positive, otherwise 'no'.
    select
        id,
        if(bonus > 0, 'yes', 'no')
    from mydatabase.test;
    NULL判断
    -- Test whether bonus is NULL.
    select
        id,
        isnull(bonus)
    from mydatabase.test;
    NULL条件,第二参数为默认值
    -- Substitute 0 when bonus is NULL (the second argument is the default).
    select
        id,
        nvl(bonus, 0)
    from mydatabase.test;
    非空查找函数
    -- Pick the first non-NULL value from the argument list.
    select
        id,
        coalesce(bonus, 0, null)
    from mydatabase.test;
    CASE匹配条件
    -- Simple CASE: compare name against fixed values and map it to a grade.
    -- NOTE(review): the sample insert in this file stores ' J ' rather than 'Jack',
    -- so the first branch matches no sample row - confirm the intended value.
    select
        id,
        name,
        (
            case name
                when 'Jack' then 'A'
                when 'Rose' then 'B'
                else 'C'
            end
        )
    from mydatabase.test;
    CASE搜索条件
    -- Searched CASE: branches are evaluated in order and the first true condition wins.
    select
        id,
        salary,
        bonus,
        (
            case
                when salary > 500 then 'A'
                when salary > 100 and bonus > 0 then 'B'
                else 'C'
            end
        )
    from mydatabase.test;

    • 聚合函数

    去重
    -- List each name once.
    select distinct
        name
    from mydatabase.test;
    计数
    -- Count all rows in the table.
    select
        count(*)
    from mydatabase.test;
    条件计数
    -- Count only the rows whose bonus is positive.
    -- count(expr) counts every non-NULL value, so count(bonus > 0) would also
    -- count rows where the comparison is false. The CASE yields NULL for
    -- non-matching rows, which count() then skips.
    select
        count(case when bonus > 0 then 1 end)
    from mydatabase.test;
    求和,求平均,最大,最小,方差
    -- Per-name salary statistics: total, mean, minimum, maximum and variance.
    select
        name,
        sum(salary),
        avg(salary),
        min(salary),
        max(salary),
        variance(salary)
    from mydatabase.test
    group by name;
    生成列表
    -- Gather every salary of a name into one array (duplicates kept).
    select
        name,
        collect_list(salary)
    from mydatabase.test
    group by name;
    生成非重列表
    -- Gather the distinct salaries of a name into one array.
    select
        name,
        collect_set(salary)
    from mydatabase.test
    group by name;

    • 字符串函数

    长度
    -- Length of name.
    select
        name,
        length(name)
    from mydatabase.test;
    查找
    -- Find the position of 'o' within name.
    select
        name,
        locate('o', name)
    from mydatabase.test;
    左填充,右填充
    -- Pad name to length 4 with underscores, on the left and on the right.
    select
        name,
        lpad(name, 4, '_'),
        rpad(name, 4, '_')
    from mydatabase.test;
    去除左空格,去除右空格,去除左右空格
    -- Strip leading spaces, trailing spaces, and both.
    select
        name,
        ltrim(name),
        rtrim(name),
        trim(name)
    from mydatabase.test;
    字符距离
    -- Edit distance between every ordered pair of different names.
    -- Both sides of the join are the distinct name list of the same table.
    select
        n1,
        n2,
        levenshtein(n1, n2)
    from
        (select distinct name as n1 from mydatabase.test) db0
    join
        (select distinct name as n2 from mydatabase.test) db1
        on n1 <> n2;
    分割
    -- Split timestring on '-' and report how many pieces result.
    select
        name,
        split(timestring, '-'),
        size(split(timestring, '-'))
    from mydatabase.test;
    分列
    -- Turn each '-'-separated piece of timestring into its own output row.
    -- The duplicated from keyword in the earlier version was a syntax error.
    select
        name,
        timesplit
    from mydatabase.test
    lateral view explode(split(timestring, '-')) s as timesplit;
    子字符串
    -- Substring examples on name (start 1, length -1) and timestring (start -8).
    -- NOTE(review): the -1 length looks like it is meant to drop the last character,
    -- but Hive may return an empty string for a non-positive length - confirm.
    select substr(name, 1, -1), substr(timestring, -8) from mydatabase.test;
    替换(注意:正则中的反斜杠需要转义,视执行环境可能需要写两个或四个反斜杠)
    -- Replace the leading date portion of timestring with '###'.
    -- The backslash is doubled because Hive string literals treat it as an
    -- escape character. A single backslash before d is dropped, leaving a
    -- pattern of plain letters that never matches a digit.
    select
        regexp_replace(timestring, '\\d+-\\d+-\\d+', '###')
    from mydatabase.test;
    提取(注意:正则中的反斜杠需要转义,视执行环境可能需要写两个或四个反斜杠)
    -- Extract the first run of digits from timestring.
    -- Index 1 refers to the first capturing group, so the pattern needs
    -- parentheses. The backslash is doubled because Hive string literals
    -- treat it as an escape character.
    select
        regexp_extract(timestring, '(\\d+)', 1)
    from mydatabase.test;
    拼接
    -- Join year, month and salary into one '|'-separated string.
    select
        name,
        concat(
            year(timestring), '|',
            month(timestring), '|',
            cast(salary as string)
        )
    from mydatabase.test;
    拼接列表
    -- Per name, join all salaries (as strings) with '|'.
    select
        name,
        concat_ws(
            '|',
            collect_list(cast(salary as string))
        )
    from mydatabase.test
    group by name;
    拼接有序列表
    -- Per name, sort the salary strings and join them with '|'.
    -- The array elements are strings because of the cast inside collect_list.
    select
        name,
        concat_ws(
            '|',
            sort_array(
                collect_list(cast(salary as string))
            )
        )
    from mydatabase.test
    group by name;
    拼接有序列表并去除排序因子(注意:正则中的反斜杠需要转义,可能需要写两个或四个反斜杠)
    -- Join all salaries in numeric order, then strip the helper sort prefix.
    -- Each salary is prefixed with its row number, zero-padded to 3 digits, so that
    -- sorting the resulting strings follows the row_number order (ascending salary).
    -- The 3-digit padding keeps that ordering only for up to 999 rows.
    -- The backslash is doubled because Hive string literals treat it as an escape
    -- character. With a single backslash the prefix pattern would never match.
    select
        regexp_replace(
            concat_ws(
                '|',
                sort_array(
                    collect_list(
                        concat(lpad(cast(rank as string), 3, '0'), ':', salary)
                    )
                )
            ),
            '\\d+:',
            ''
        ) as lst
    from (
        select
            name,
            salary,
            row_number() over(order by salary) as rank
        from mydatabase.test
    ) db0;

    • 生成函数

    EXPLODE
    -- Emit one row per space-separated piece of timestring.
    select
        id,
        part
    from mydatabase.test
    lateral view explode(split(timestring, ' ')) t as part;

    • 选择函数

    IN
    -- Keep only rows whose name is one of the listed values.
    select
        id,
        name
    from mydatabase.test
    where name in ('Dickson', 'Rose');

    • 分组函数1

    ROW_NUMBER
    -- Number the rows of each name by descending salary.
    select
        id,
        name,
        salary,
        row_number() over(partition by name order by salary desc) as rank
    from mydatabase.test;
    RANK
    -- Rank the rows of each name by descending salary using rank().
    select
        id,
        name,
        salary,
        rank() over(partition by name order by salary desc) as rank
    from mydatabase.test;
    DENSE_RANK
    -- Rank the rows of each name by descending salary using dense_rank().
    select
        id,
        name,
        salary,
        dense_rank() over(partition by name order by salary desc) as rank
    from mydatabase.test;
    SUM
    -- Running salary total per name, accumulated in timestring order.
    -- The earlier version began with select_id, which is not a valid statement.
    select
        id,
        name,
        sum(salary) over(partition by name order by timestring asc) sum_salary
    from mydatabase.test;
    BEFORE
    -- Salary of the previous row for the same name, in timestring order.
    select
        id,
        name,
        salary,
        lag(salary, 1) over(partition by name order by timestring asc) as before_salary
    from mydatabase.test;
    AFTER
    -- Salary of the next row for the same name, in timestring order.
    select
        id,
        name,
        salary,
        lead(salary, 1) over(partition by name order by timestring asc) as after_salary
    from mydatabase.test;

    • 分组函数2

    GROUPING SETS
    -- Salary totals for two groupings in one pass: per month, and per (month, day).
    select
        month,
        day,
        sum(salary)
    from (
        select
            month(timestring) month,
            day(timestring) day,
            salary
        from mydatabase.test
    ) db
    group by month, day grouping sets(month, (month, day))
    order by month asc, day asc;
    CUBE
    -- Salary totals over month and day aggregated with cube.
    select
        month,
        day,
        sum(salary)
    from (
        select
            month(timestring) month,
            day(timestring) day,
            salary
        from mydatabase.test
    ) db
    group by month, day with cube
    order by month asc, day asc;
    ROLLUP
    -- Salary totals over month and day aggregated with rollup.
    select
        month,
        day,
        sum(salary)
    from (
        select
            month(timestring) month,
            day(timestring) day,
            salary
        from mydatabase.test
    ) db
    group by month, day with rollup
    order by month asc, day asc;

    参考文献:

    https://www.cnblogs.com/MOBIN/p/5618747.html#7

    https://blog.csdn.net/zhanaolu4821/article/details/81871041

    https://baijiahao.baidu.com/s?id=1613382585734336695&wfr=spider&for=pc

    https://blog.csdn.net/guodong2k/article/details/79459282

    https://www.cnblogs.com/zhaohz/p/4672943.html

    https://www.cnblogs.com/Allen-rg/p/9268627.html

  • 相关阅读:
    Python数据类型的可变与不可变
    垃圾回收机制详解、运算符和格式化输出
    Python变量详解
    计算机硬件的基本组成-大框架的建立
    Python语言介绍
    MySQL 安装教程
    Java中遍历Map对象的方式
    判断字段的值是否为空
    Java中判断某一字符串是否包含数字、字母和中文
    把字符串字节数组写入文件
  • 原文地址:https://www.cnblogs.com/jhc888007/p/11085012.html
Copyright © 2011-2022 走看看