Hive GenericUDF2

    Let's look at another small example: totaling up a student's scores.

    The following table exists in Hive:
     
    hive> describe tb_test2;
    OK
    name    string
    score_list  array<map<string,int>>
    Time taken: 0.074 seconds
    hive> select * from tb_test2;
    OK
    A   [{"math":100,"english":90,"history":85}]
    B   [{"math":95,"english":80,"history":100}]
    C   [{"math":80,"english":90,"history":100}]
    Time taken: 0.107 seconds
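
    The original post does not show the DDL behind tb_test2; a minimal sketch that produces a table of this shape (delimiter and load details are omitted, since the source does not give them) is:

    create table tb_test2 (
      name string,
      score_list array<map<string,int>>
    );

    Each row pairs a name with an array of subject-to-score maps; the UDF below sums every score in that array.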

    Write the GenericUDF:

    package com.wz.udf;

    import java.util.ArrayList;

    import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
    import org.apache.hadoop.hive.serde2.lazy.LazyMap;
    import org.apache.hadoop.hive.serde2.lazy.LazyString;
    import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;

    public class helloGenericUDFNew extends GenericUDF {
        // Inspectors for the input arguments, resolved once in initialize().
        private ObjectInspector nameObj;
        private ListObjectInspector listoi;
        private MapObjectInspector mapOI;

        @Override
        public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
            nameObj = arguments[0];
            listoi = (ListObjectInspector) arguments[1];
            mapOI = (MapObjectInspector) listoi.getListElementObjectInspector();

            // Define the output struct: {name: string, totalScore: int}.
            ArrayList<String> structFieldNames = new ArrayList<String>();
            ArrayList<ObjectInspector> structFieldObjectInspectors = new ArrayList<ObjectInspector>();
            structFieldNames.add("name");
            structFieldNames.add("totalScore");
            structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector);
            structFieldObjectInspectors.add(PrimitiveObjectInspectorFactory.writableIntObjectInspector);

            StructObjectInspector si2 = ObjectInspectorFactory.getStandardStructObjectInspector(
                    structFieldNames, structFieldObjectInspectors);
            return si2;
        }

        @Override
        public Object evaluate(DeferredObject[] arguments) throws HiveException {
            LazyString lName = (LazyString) arguments[0].get();
            String strName = ((StringObjectInspector) nameObj).getPrimitiveJavaObject(lName);

            int nElements = listoi.getListLength(arguments[1].get());
            int nTotalScore = 0;
            // Walk the list; each element is a map of subject -> score.
            for (int i = 0; i < nElements; i++) {
                LazyMap lMap = (LazyMap) listoi.getListElement(arguments[1].get(), i);
                // Sum the values of this map only; summing per map avoids
                // double-counting when the array holds more than one map.
                for (Object value : mapOI.getMap(lMap).values()) {
                    nTotalScore += Integer.parseInt(value.toString());
                }
            }

            // Return the row as an Object[] matching the struct declared above.
            Object[] e = new Object[2];
            e[0] = new Text(strName);
            e[1] = new IntWritable(nTotalScore);
            return e;
        }

        @Override
        public String getDisplayString(String[] children) {
            assert (children.length > 0);

            StringBuilder sb = new StringBuilder();
            sb.append("helloGenericUDFNew(");
            sb.append(children[0]);
            sb.append(")");

            return sb.toString();
        }
    }
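
    Note that evaluate() casts the incoming objects to LazyString and LazyMap, which ties the UDF to Hive's lazy text SerDe; if the function is ever fed a constant or data read through a different SerDe, those casts fail with a ClassCastException. A more defensive sketch (an alternative, not code from the original post) resolves everything through the object inspectors instead; it reuses nameObj, listoi, and mapOI from initialize() and additionally imports org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector:

    // Sketch of a SerDe-independent evaluate() body.
    String strName = ((StringObjectInspector) nameObj)
            .getPrimitiveJavaObject(arguments[0].get());

    int nTotalScore = 0;
    Object list = arguments[1].get();
    // The map values are declared as int in the column type, so read them
    // through the map's value inspector rather than parsing toString().
    IntObjectInspector valueOI = (IntObjectInspector) mapOI.getMapValueObjectInspector();
    for (int i = 0; i < listoi.getListLength(list); i++) {
        Object scoreMap = listoi.getListElement(list, i);
        for (Object v : mapOI.getMap(scoreMap).values()) {
            nTotalScore += valueOI.get(v);
        }
    }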

    Run it in Hive; the results are as follows:

    hive> add jar /home/wangzhun/hive/hive-0.8.1/lib/helloGenericUDFNew.jar;
    Added /home/wangzhun/hive/hive-0.8.1/lib/helloGenericUDFNew.jar to class path
    Added resource: /home/wangzhun/hive/hive-0.8.1/lib/helloGenericUDFNew.jar
    hive> create temporary function hellonew as 'com.wz.udf.helloGenericUDFNew';
    OK
    Time taken: 0.016 seconds
    hive> select hellonew(tb_test2.name, tb_test2.score_list) from tb_test2;
    Total MapReduce jobs = 1
    Launching Job 1 out of 1
    Number of reduce tasks is set to 0 since there's no reduce operator
    Starting Job = job_201312091733_0018, Tracking URL = http://localhost:50030/jobdetails.jsp?jobid=job_201312091733_0018
    Kill Command = /home/wangzhun/hadoop/hadoop-0.20.2/bin/../bin/hadoop job  -Dmapred.job.tracker=localhost:9001 -kill job_201312091733_0018
    Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0
    2013-12-09 22:31:22,328 Stage-1 map = 0%,  reduce = 0%
    2013-12-09 22:31:25,354 Stage-1 map = 100%,  reduce = 0%
    2013-12-09 22:31:28,390 Stage-1 map = 100%,  reduce = 100%
    Ended Job = job_201312091733_0018
    MapReduce Jobs Launched:
    Job 0: Map: 1   HDFS Read: 99 HDFS Write: 18 SUCESS
    Total MapReduce CPU Time Spent: 0 msec
    OK
    {"name":"A","totalscore":275}
    {"name":"B","totalscore":275}
    {"name":"C","totalscore":270}
    Time taken: 21.7 seconds
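
    Because the function returns a struct, individual fields can also be pulled out with dot notation from a subquery; a sketch (assuming Hive's usual lowercasing of struct field names):

    hive> select s.name, s.totalscore
        > from (select hellonew(name, score_list) as s from tb_test2) t;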