zoukankan      html  css  js  c++  java
  • hive练习-行列转换 窗口函数

    1、查看数据

    $ head orders.csv
    order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
    2539329,1,prior,1,2,08,
    2398795,1,prior,2,3,07,15.0
    473747,1,prior,3,3,12,21.0
    2254736,1,prior,4,4,07,29.0
    431534,1,prior,5,4,15,28.0
    3367565,1,prior,6,2,07,19.0
    550135,1,prior,7,1,09,20.0
    3108588,1,prior,8,1,14,14.0
    2295261,1,prior,9,1,16,0.0
    
    
    $ head order_products__prior.csv
    order_id,product_id,add_to_cart_order,reordered
    2,33120,1,1
    2,28985,2,1
    2,9327,3,0
    2,45918,4,1
    2,30035,5,0
    2,17794,6,1
    2,40141,7,1
    2,1819,8,1
    2,43668,9,0

    2、创建表结构

    create table orders(
    
    order_id string,
    user_id string,
    eval_set string,
    order_number bigint,
    order_dow string,
    order_hour_of_day string,
    days_since_prior_order string
    )row format delimited fields terminated by ','
    lines terminated by '
    ';
    
    create table order_products__prior(
    order_id string,
    product_id string,
    add_to_cart_order string,
    reordered string
    )row format delimited fields terminated by ','
    lines terminated by '
    ';

    3、每个用户有多少个订单[orders](聚合count)

    select user_id,count(*) from orders group by user_id 
    order by user_id desc limit 3;
    
    99999    35
    99998    6
    99997    4

    4、每个用户买了多少商品(表连接)

    select user_id,count(*) from 
    orders join order_products__prior
    on(orders.order_id=order_products__prior.order_id) 
    group by user_id
    order by user_id desc limit 3;
    99999    329
    99998    21
    99997    38

    5、每个用户平均每个订单是多少商品(聚合函数avg)

    select user_id,avg(cnt)
    from orders join
    (
    select order_id,count(*) as cnt from order_products__prior group by order_id)t
    on (orders.order_id=t.order_id)
    group by user_id
    order by user_id desc limit 3;
    
    99999    9.676470588235293
    99998    4.2
    99997    12.666666666666666

    6、每个用户一周中的订单分布(行装列,case when)

    select user_id,
    sum(case when order_dow=='0' then '1' else '0' end )as dow_0,
    sum(case when order_dow=='1' then '1' else '0' end )as dow_1,
    sum(case when order_dow=='2' then '1' else '0' end )as dow_2,
    sum(case when order_dow=='3' then '1' else '0' end )as dow_3,
    sum(case when order_dow=='4' then '1' else '0' end )as dow_4,
    sum(case when order_dow=='5' then '1' else '0' end )as dow_5,
    sum(case when order_dow=='6' then '1' else '0' end )as dow_6
    from orders
    group by user_id
    order by user_id desc
    limit 10;

    7、每个用户平均每个购买天中,购买的商品数量(if补充缺失值)

    --方法1
    select user_id,avg(s_cnt) as avg_cnt
    from
    (select user_id,
    if(days_since_prior_order="",0,days_since_prior_order) as days_since_prior_order,
    sum(cnt) as s_cnt from 
    orders
    join 
    (select order_id,count(*) as cnt from order_products__prior group by order_id)t
    on (orders.order_id=t.order_id)
    group by user_id,days_since_prior_order)t2
    group by user_id
    order by user_id desc limit 3;
    
    --方法二
    select user_id,sum(cnt)/count(distinct days_since_prior_order)
    from orders 
    join
    (select order_id,count(*) as cnt from order_products__prior group by order_id)t
    on 
    (orders.order_id=t.order_id)
    group by user_id
    order by user_id desc limit 3;

    8、每个用户最喜欢的3个商品(字段链接concat_ws,列转行collect_list或者concat_set,排序row_number() over(partition by x order by y) as z)

    select user_id,collect_list(concat_ws("_",product_id,cast(r as string))) as top3
    from
    (
    select user_id, product_id,
    row_number() over(partition by user_id order by cnt desc) as r,
    cnt
    from
    (select user_id,product_id,count(*) as cnt
    from orders
    join order_products__prior
    on(orders.order_id=order_products__prior.order_id)
    group by user_id,product_id
    )t1
    )t2
    where r<=3
    group by user_id
    order by user_id desc
    limit 10;

     总结: 

    1、函数的使用:

    concat_ws

    collect_list

    row_number() over(partition by xxx order by)

    2、hive优化:

  • 相关阅读:
    rest_framework规范
    跨域问题
    网站部署(二)
    服务器更改密码后,git不能连接问题
    网站部署(一)
    Ajax
    AJAX基本使用
    Java之JNDI详解
    IntelliJ IDEA(2018.3.3) 的介绍、安装、破解、配置与使用
    数据库修改密码风险高,如何保证业务持续,这几种密码双活方案可以参考
  • 原文地址:https://www.cnblogs.com/students/p/12825710.html
Copyright © 2011-2022 走看看