zoukankan      html  css  js  c++  java
  • hive:数据库“行转列”操作---使用collect_set/collect_list/collect_all & row_number()over(partition by 分组字段 [order by 排序字段])

    方案一:请参考《数据库“行转列”操作---使用row_number()over(partition by 分组字段 [order by 排序字段])》,该方案在sqlserver、oracle、mysql、hive中均适用。

    在hive中,除方案一之外还有以下两种方案:

    创建测试表,并插入测试数据:

    -- Hive test table for the row-to-column demos (collect_set / collect_list)
    create table tommyduan_test (
        gridid      string,
        height      int,
        cell        string,
        mrcount     int,
        weakmrcount int
    );
    
    -- Load the whole fixture with one multi-row insert instead of eleven separate
    -- single-row insert statements. The rows are unchanged: seven cells for grid g1
    -- followed by four cells for grid g2.
    insert into tommyduan_test values
        ('g1', 1, 'cell1', 12, 3),
        ('g1', 1, 'cell2', 22, 3),
        ('g1', 1, 'cell3', 23, 3),
        ('g1', 1, 'cell4', 1, 3),
        ('g1', 1, 'cell5', 3, 3),
        ('g1', 1, 'cell6', 4, 3),
        ('g1', 1, 'cell19', 21, 3),
        ('g2', 1, 'cell4', 1, 3),
        ('g2', 1, 'cell5', 3, 3),
        ('g2', 1, 'cell6', 4, 3),
        ('g2', 1, 'cell19', 21, 3);

    方案二:使用collect_list方案

    注意:collect_list是一个list集合,允许重复的记录(因此下面的weakmrcountArray中会出现重复值);collect_set则是一个set集合,不允许重复的记录插入

    -- Top three cells per (gridid, height) ranked by mrcount, gathered into three
    -- parallel arrays. collect_list keeps duplicate values.
    -- The inner group by collapses exact duplicate rows before they are ranked.
    -- cell is the tie-breaker in the ranking so that rows with equal mrcount are
    -- selected the same way on every run.
    -- NOTE(review): array element order follows the order in which rows reach
    -- collect_list, which Hive does not appear to guarantee - confirm before
    -- relying on element positions.
    select
        gridid,
        height,
        collect_list(cell)        as cellArray,
        collect_list(mrcount)     as mrcountArray,
        collect_list(weakmrcount) as weakmrcountArray
    from (
        select
            gridid,
            height,
            cell,
            mrcount,
            weakmrcount,
            row_number() over (partition by gridid, height order by mrcount desc, cell) as rn
        from tommyduan_test
        group by gridid, height, cell, mrcount, weakmrcount
    ) ranked
    where rn < 4
    group by gridid, height;
    +---------+---------+-----------------------------+---------------+-------------------+--+
    | gridid  | height  |          cellarray          | mrcountarray  | weakmrcountarray  |
    +---------+---------+-----------------------------+---------------+-------------------+--+
    | g1      | 1       | ["cell3","cell2","cell19"]  | [23,22,21]    | [3,3,3]           |
    | g2      | 1       | ["cell19","cell6","cell5"]  | [21,4,3]      | [3,3,3]           |
    +---------+---------+-----------------------------+---------------+-------------------+--+
    
    -- Pivot the top three cells per (gridid, height) into fixed columns
    -- cell1..cell3 with their mrcount and weakmrcount.
    -- A slot with no matching row is filled with the string -9999. The int array
    -- elements are cast to string explicitly so that both branches of every case
    -- expression have the same type.
    -- cell is the tie-breaker in the ranking so that rows with equal mrcount are
    -- selected the same way on every run.
    -- NOTE(review): the slot positions rely on collect_list returning rows in rank
    -- order and on the three arrays staying aligned - Hive does not appear to
    -- guarantee either, so confirm before relying on it.
    select
        gridid,
        height,
        (case when size(cellArray) > 0 then cellArray[0] else '-9999' end) as cell1,
        (case when size(cellArray) > 0 then cast(mrcountArray[0] as string) else '-9999' end) as cell1_mrcount,
        (case when size(cellArray) > 0 then cast(weakmrcountArray[0] as string) else '-9999' end) as cell1_weakmrcount,
        (case when size(cellArray) > 1 then cellArray[1] else '-9999' end) as cell2,
        (case when size(cellArray) > 1 then cast(mrcountArray[1] as string) else '-9999' end) as cell2_mrcount,
        (case when size(cellArray) > 1 then cast(weakmrcountArray[1] as string) else '-9999' end) as cell2_weakmrcount,
        (case when size(cellArray) > 2 then cellArray[2] else '-9999' end) as cell3,
        (case when size(cellArray) > 2 then cast(mrcountArray[2] as string) else '-9999' end) as cell3_mrcount,
        (case when size(cellArray) > 2 then cast(weakmrcountArray[2] as string) else '-9999' end) as cell3_weakmrcount
    from (
        select
            gridid,
            height,
            collect_list(cell)        as cellArray,
            collect_list(mrcount)     as mrcountArray,
            collect_list(weakmrcount) as weakmrcountArray
        from (
            select
                gridid,
                height,
                cell,
                mrcount,
                weakmrcount,
                row_number() over (partition by gridid, height order by mrcount desc, cell) as rn
            from tommyduan_test
            group by gridid, height, cell, mrcount, weakmrcount
        ) ranked
        where rn < 4
        group by gridid, height
    ) top_cells;
    +---------+---------+---------+----------------+--------------------+--------+----------------+--------------------+---------+----------------+--------------------+--+
    | gridid  | height  |  cell1  | cell1_mrcount  | cell1_weakmrcount  | cell2  | cell2_mrcount  | cell2_weakmrcount  |  cell3  | cell3_mrcount  | cell3_weakmrcount  |
    +---------+---------+---------+----------------+--------------------+--------+----------------+--------------------+---------+----------------+--------------------+--+
    | g1      | 1       | cell3   | 23             | 3                  | cell2  | 22             | 3                  | cell19  | 21             | 3                  |
    | g2      | 1       | cell19  | 21             | 3                  | cell6  | 4              | 3                  | cell5   | 3              | 3                  |
    +---------+---------+---------+----------------+--------------------+--------+----------------+--------------------+---------+----------------+--------------------+--+

    方案三:使用collect_set方案

    注意:collect_set是一个set集合,不允许重复的记录插入(因此下面weakmrcount的结果中只剩下一个3)

    -- Every cell per (gridid, height) gathered into arrays with collect_set, which
    -- drops repeated values. The inner query sorts the rows before they are grouped.
    -- NOTE(review): whether that sort order survives into the arrays is not
    -- something Hive appears to guarantee - confirm before relying on it.
    select
        gridid,
        height,
        collect_set(cell),
        collect_set(mrcount),
        collect_set(weakmrcount)
    from (
        select gridid, height, cell, mrcount, weakmrcount
        from tommyduan_test
        order by gridid, height, mrcount desc
    ) sorted_rows
    group by gridid, height;
    +---------+---------+-------------------------------------------------------------+----------------------+------+--+
    | gridid  | height  |                             _c2                             |         _c3          | _c4  |
    +---------+---------+-------------------------------------------------------------+----------------------+------+--+
    | g1      | 1       | ["cell3","cell2","cell19","cell1","cell6","cell5","cell4"]  | [23,22,21,12,4,3,1]  | [3]  |
    | g2      | 1       | ["cell19","cell6","cell5","cell4"]                          | [21,4,3,1]           | [3]  |
    +---------+---------+-------------------------------------------------------------+----------------------+------+--+
    
    -- Top three cells per (gridid, height) ranked by mrcount, gathered with
    -- collect_set, so each array holds only distinct values and the three arrays
    -- may differ in length.
    -- cell is the tie-breaker in the ranking so that rows with equal mrcount are
    -- selected the same way on every run.
    select
        gridid,
        height,
        collect_set(cell)        as cellArray,
        collect_set(mrcount)     as mrcountArray,
        collect_set(weakmrcount) as weakmrcountArray
    from (
        select
            gridid,
            height,
            cell,
            mrcount,
            weakmrcount,
            row_number() over (partition by gridid, height order by mrcount desc, cell) as rn
        from tommyduan_test
        group by gridid, height, cell, mrcount, weakmrcount
    ) ranked
    where rn < 4
    group by gridid, height;
    +---------+---------+-----------------------------+---------------+-------------------+--+
    | gridid  | height  |          cellarray          | mrcountarray  | weakmrcountarray  |
    +---------+---------+-----------------------------+---------------+-------------------+--+
    | g1      | 1       | ["cell3","cell2","cell19"]  | [23,22,21]    | [3]               |
    | g2      | 1       | ["cell19","cell6","cell5"]  | [21,4,3]      | [3]               |
    +---------+---------+-----------------------------+---------------+-------------------+--+
    
    -- Top three cells per (gridid, height) ranked by mrcount. Each row is packed
    -- into one comma-separated string (cell, mrcount, weakmrcount) so that the
    -- three values of a row stay together inside a single collect_set array.
    -- cell is the tie-breaker in the ranking so that rows with equal mrcount are
    -- selected the same way on every run.
    -- The statement is now terminated explicitly so that it does not run into
    -- whatever follows it in a script.
    select
        gridid,
        height,
        collect_set(concat_ws(',', cell, cast(mrcount as string), cast(weakmrcount as string))) as cellArray
    from (
        select
            gridid,
            height,
            cell,
            mrcount,
            weakmrcount,
            row_number() over (partition by gridid, height order by mrcount desc, cell) as rn
        from tommyduan_test
        group by gridid, height, cell, mrcount, weakmrcount
    ) ranked
    where rn < 4
    group by gridid, height;
    +---------+---------+--------------------------------------------+--+
    | gridid  | height  |                 cellarray                  |
    +---------+---------+--------------------------------------------+--+
    | g1      | 1       | ["cell3,23,3","cell2,22,3","cell19,21,3"]  |
    | g2      | 1       | ["cell19,21,3","cell6,4,3","cell5,3,3"]    |
    +---------+---------+--------------------------------------------+--+
    
    
    -- Pivot the top three cells per (gridid, height) into fixed columns
    -- cell1..cell3 with their mrcount and weakmrcount.
    -- Each ranked row is packed into one underscore-separated string, collected
    -- with collect_set, then unpacked again with split. A slot with no matching
    -- row is filled with the string -9999. All output columns are strings.
    -- cell is the tie-breaker in the ranking so that rows with equal mrcount are
    -- selected the same way on every run.
    -- NOTE(review): a cell value that itself contains an underscore would be split
    -- into extra fields, and the slot positions rely on collect_set returning rows
    -- in rank order - confirm both against the real data and the Hive version.
    select
        gridid,
        height,
        (case when size(cellArray) > 0 then split(cellArray[0], '_')[0] else '-9999' end) as cell1,
        (case when size(cellArray) > 0 then split(cellArray[0], '_')[1] else '-9999' end) as cell1_mrcount,
        (case when size(cellArray) > 0 then split(cellArray[0], '_')[2] else '-9999' end) as cell1_weakmrcount,
        (case when size(cellArray) > 1 then split(cellArray[1], '_')[0] else '-9999' end) as cell2,
        (case when size(cellArray) > 1 then split(cellArray[1], '_')[1] else '-9999' end) as cell2_mrcount,
        (case when size(cellArray) > 1 then split(cellArray[1], '_')[2] else '-9999' end) as cell2_weakmrcount,
        (case when size(cellArray) > 2 then split(cellArray[2], '_')[0] else '-9999' end) as cell3,
        (case when size(cellArray) > 2 then split(cellArray[2], '_')[1] else '-9999' end) as cell3_mrcount,
        (case when size(cellArray) > 2 then split(cellArray[2], '_')[2] else '-9999' end) as cell3_weakmrcount
    from (
        select
            gridid,
            height,
            collect_set(concat_ws('_', cell, cast(mrcount as string), cast(weakmrcount as string))) as cellArray
        from (
            select
                gridid,
                height,
                cell,
                mrcount,
                weakmrcount,
                row_number() over (partition by gridid, height order by mrcount desc, cell) as rn
            from tommyduan_test
            group by gridid, height, cell, mrcount, weakmrcount
        ) ranked
        where rn < 4
        group by gridid, height
    ) top_cells;
    +---------+---------+---------+----------------+--------------------+--------+----------------+--------------------+---------+----------------+--------------------+--+
    | gridid  | height  |  cell1  | cell1_mrcount  | cell1_weakmrcount  | cell2  | cell2_mrcount  | cell2_weakmrcount  |  cell3  | cell3_mrcount  | cell3_weakmrcount  |
    +---------+---------+---------+----------------+--------------------+--------+----------------+--------------------+---------+----------------+--------------------+--+
    | g1      | 1       | cell3   | 23             | 3                  | cell2  | 22             | 3                  | cell19  | 21             | 3                  |
    | g2      | 1       | cell19  | 21             | 3                  | cell6  | 4              | 3                  | cell5   | 3              | 3                  |
    +---------+---------+---------+----------------+--------------------+--------+----------------+--------------------+---------+----------------+--------------------+--+
  • 相关阅读:
    Python网络爬虫 第三章 requests进阶
    Python网络爬虫 第二章 数据解析
    Java 工具库Hutool-db数据库简单操作
    JavaScript基础
    K-Means文档聚类
    利用余弦距离比较文档间的相似度
    算法类——数学问题汇总
    基于K-Means的文本聚类
    加速国内 Github 访问,下载,的9种方案!
    为什么用MQTT而不用TCP长连接透传
  • 原文地址:https://www.cnblogs.com/yy3b2007com/p/8280511.html
Copyright © 2011-2022 走看看