zoukankan      html  css  js  c++  java
  • hive 在统计汇总时候,如何去重

    create  table xxxx.test_collect_list_set(
    
    first_level_directory int ,
    second_level_directory int ,
    third_level_directory int ,
    order_id int 
    
    )
    
    ROW FORMAT DELIMITED 
      FIELDS TERMINATED BY '	' 
      NULL DEFINED AS '' 
    STORED AS ORC
    tblproperties ('orc.compress'='SNAPPY');
    
    
    insert overwrite into xxxx.test_collect_list_set
    values
    (1,2,3,1),
    (1,2,3,1),
    (1,2,3,4),
    (1,2,4,1),
    (1,2,4,5)
    
    ;
    
    
    select first_level_directory,second_level_directory,collect_set(set_list) from ( 
    select first_level_directory,
           second_level_directory,
           third_level_directory ,
           concat_ws(',',collect_set(order_id ))  set_list 
    from xxxx.test_collect_list_set
    group by first_level_directory,second_level_directory,third_level_directory
    ) a
    GROUP by first_level_directory,second_level_directory
    
    Error:
    Caused by: java.lang.ClassCastException: java.util.ArrayList cannot be cast to org.apache.hadoop.io.IntWritable
    
    
    drop table xxxx.test_collect_list_set ;
    
    
    
    SQL :
    select first_level_directory,second_level_directory,udf(concat_ws(',',collect_set(set_list))) as set_list,udaf(udaf_set) as udaf_set from ( 
    select first_level_directory,
           second_level_directory,
           third_level_directory , 
           concat_ws(',',collect_set(cast(order_id  as string))) set_list ,
    	   udaf(order_id)  udaf_set 
    from xxxx.test_collect_list_set
    group by first_level_directory,second_level_directory,third_level_directory
    ) a
    GROUP by first_level_directory,second_level_directory;
    
    Result :
    1	2	1,4,1,5	["1,4","1,5"]
    
    TIps :
    
    I want to remove the duplicted data count from the back 
    the data should be become to as froming 
    1,4,1,5   ---> 1,4,5    (do not care about the sort)
    
    May be the bast mothod is write a udf funciton using java method to remove he duplicted data 
    
    The other way is more complicated which suppliey by my colleges ,more info please refer to 
    https://blog.csdn.net/inte_sleeper/article/details/7196114
    
    My another colleges give me this sql ,it works fine alone ,but if we add more columns it will give some errors 
    
    select collect_set(col) from (
    select explode(split('1,2,3,4,1,2',',')) as col from test 
    ) a;
    
    
    Also we can using udaf to finish this work ,as talked with college it seems more easy for sql developers
    
    
    errors like :
    UDTF's are not supported outside the SELECT clause, nor nested in expressions
    
    
    
    
    
    
    

  • 相关阅读:
    DataAdapter的Fill方法(转)
    C#读取xml节点数据方法小结
    Service-Oriented Architecture,SOA(转)
    SOAP详解(转)
    WebService到底是什么?(转)
    只需要2个工具,百度云盘大文件就能用迅雷和IDM下载
    刘铁猛-深入浅出WPF-系列资源汇总
    WCF书籍
    书籍:wpf学习书籍介绍
    在与 SQL Server 建立连接时出现与网络相关的或特定于实例的错误。未找到或无法访问服务器(转)
  • 原文地址:https://www.cnblogs.com/TendToBigData/p/10501185.html
Copyright © 2011-2022 走看看