zoukankan      html  css  js  c++  java
  • hive操作实例

    word count

    select word,count(*)
    from(
    select
    explode(split(sentence,' '))
    as word
    from article
    )t
    group by word;

    split:数据切分为数组

    explode:行转列

    统计每个用户购买过多少个商品

    建表

    create table orders(
    order_id string,
    user_id string,
    eval_set string,
    order_number string,
    order_dow string,
    order_hour_of_day string,
    days_since_prior_order string)
    row format delimited fields terminated by ',';
    
    create table order_products_prior(
    order_id string,
    product_id string,
    add_to_cart_order string,
    reordered string)
    row format delimited fields terminated by ',';

    --1.统计每个订单有多少个商品:

    select order_id,count(1) as prod_cnt
    from order_products_prior
    group by order_id
    order by prod_cnt desc
    limit 10;

    --2.对每个用户下过的订单对应的商品数目求和:

    select user_id,sum(prod_cnt) as prod_sum
    from orders od
    join(
    select order_id,count(1) as prod_cnt
    from order_products_prior
    group by order_id)pro
    on (od.order_id=pro.order_id)
    group by od.user_id desc
    order by prod_sum
    limit 10;

    每个用户最喜爱购买的三个product是什么,最终表结构可以是3个列,或者是一个字符串

    (数据表和上个例子相同)

    --1.统计每个用户和每种商品产生的行为数量

    select user_id,product_id,count(1) as prod_buy_cnt
    from orders t1
    join order_products_prior t2 
    on t1.order_id=t2.order_id
    group by user_id,product_id
    limit 20;

    --2.对一个用户不同商品,按照购买次数进行降序(desc)排列

    select user_id,product_id,prod_buy_cnt,
    row_number() over(partition by user_id order by prod_buy_cnt desc) as row_num
    from(
        select user_id,product_id,count(1) as prod_buy_cnt
        from orders t1
        join order_products_prior t2 
        on t1.order_id=t2.order_id
        group by user_id,product_id
    )t12
    limit 20;

    得到的结果类似于:

    user1  product1  1

    user1  product2  2

    user1  product3  3

    user2  product1  1

    user2  product2  2

    user2  product3  3

    --3.取每个用户购买次数最多的3个商品列转行

    select user_id,collect_list(concat_ws('_',product_id,cast(row_num as string))) as top_3_prods
    from
    (
    select user_id,product_id,prod_buy_cnt,
    row_number() over(partition by user_id order by prod_buy_cnt desc) as row_num
    from(
    select user_id,product_id,count(1) as prod_buy_cnt
    from orders t1
    join order_products_prior t2 
    on t1.order_id=t2.order_id
    group by user_id,product_id
    )t12
    )t
    where row_num<4
    group by user_id
    limit 20;

    每个用户最喜爱购买的前10%个product是什么

    select user_id,collect_list(concat_ws('_',product_id,cast(rk as string),cast(prod_cate_cnt as string))) as top_10_prod
    from
    (
      select
      user_id,product_id,usr_prod_cnt,
      row_number() over(distribute by user_id sort by usr_prod_cnt desc) as rk,
      ceil(cast(count(1) over(partition by user_id) as double)*0.1) as prod_cate_cnt
      --ceil(cast(sum(usr_prod_cnt) over(partition by user_id) as double)*0.1) as total_prod_cnt
      from
      (
        select user_id,product_id,count(1) as usr_prod_cnt
        from orders 
        join 
        order_products_prior pri
        on orders.order_id=pri.order_id
        group by user_id,product_id
      )t1
    )t
    where rk<=prod_cate_cnt
    group by user_id
    limit 100;

    建分区表,orders表按照order_dow建立分区表orders_part,然后从hive查询orders动态插入orders_part表中

    1.建立分区表

    create table order_part(
    order_id string,
    user_id string,
    eval_set string,
    order_number string,
    order_hour_of_day string,
    days_since_prior_order string
    )partitioned by(order_dow string)
    row format delimited fields terminated by '	';

    2.动态插入分区表

    set hive.exec.dynamic.partition=true; --使用动态分区
    set hive.exec.dynamic.partition.mode=nonstrict; --使用无限制模式
    
    insert overwrite table order_part partition(order_dow) --(dt='20190512')
    select order_id,user_id,eval_set,order_number,order_hour_of_day,days_since_prior_order,order_dow 
    from orders --where order_dow='2'
  • 相关阅读:
    Autofac(01)
    深入理解ADO.NET Entity Framework(02)
    使用excel 数据透视表画图
    C# 控制CH341进行SPI,I2C读写
    C# winform使用combobox遍历文件夹内所有文件
    通用分页存储过程
    如何让你的SQL运行得更快
    sql优化之使用索引
    SQL优化
    SQL 循环语句几种写法
  • 原文地址:https://www.cnblogs.com/xumaomao/p/12621472.html
Copyright © 2011-2022 走看看