zoukankan      html  css  js  c++  java
  • Hive自定义UDTF

    需求:
        原始数据为
            F8    楼梯    MULTIPOLYGON(((27.7363472618163 -76.4068597634723,27.7363472618163 -80.0131241459239,19.72894019261 -80.0118552827063,19.502938400954 -76.4055550890202,27.7363472618163 -76.4068597634723)))
        我自定义的UDTF需要把POI数据使用0、1;1、2、2、3;3、4...规则多行输出,如下:
            F8    楼梯    27.7363472618163    -76.4068597634723    27.7363472618163    -80.0131241459239
            F8    楼梯    27.7363472618163    -80.0131241459239    19.72894019261     -80.0118552827063
            F8    楼梯    19.72894019261     -80.0118552827063    19.502938400954     -76.4055550890202
            F8    楼梯    19.502938400954     -76.4055550890202    27.7363472618163 -76.4068597634723
    
            
    如下使用自定义UDTF实现        
    1、添加依赖jar
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.mengyao.dataformat</groupId>
        <artifactId>hortonworks</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <packaging>jar</packaging>
    
        <name>hortonworks</name>
        <url>http://maven.apache.org</url>
    
        <repositories>
            <!-- hortonworks -->
            <repository>
                <releases>
                    <enabled>true</enabled>
                    <updatePolicy>always</updatePolicy>
                    <checksumPolicy>warn</checksumPolicy>
                </releases>
                <snapshots>
                    <enabled>false</enabled>
                    <updatePolicy>never</updatePolicy>
                    <checksumPolicy>fail</checksumPolicy>
                </snapshots>
                <id>HDPReleases</id>
                <name>HDP Releases</name>
                <url>http://repo.hortonworks.com/content/repositories/releases/</url>
                <layout>default</layout>
            </repository>
            <!-- cloudera -->
            <!--
            <repository>
                <id>cloudera</id>
                <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
            </repository>
             -->
        </repositories>
    
        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <junit.version>4.10</junit.version>
            <hortonworks.hadoop.version>2.7.1.2.3.2.0-2950</hortonworks.hadoop.version>
            <hortonworks.hive.version>1.2.1.2.3.2.0-2950</hortonworks.hive.version>
            <slf4j.version>1.7.10</slf4j.version>
        </properties>
    
        <dependencies>
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>${junit.version}</version>
                <scope>test</scope>
            </dependency>
            <dependency>
                <groupId>jdk.tools</groupId>
                <artifactId>jdk.tools</artifactId>
                <version>1.7</version>
                <scope>system</scope>
                <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
            </dependency>
            <dependency>
                <groupId>org.mortbay.jetty</groupId>
                <artifactId>jetty</artifactId>
                <version>6.1.26</version>
            </dependency>
    
            <!-- HortonWorks Hadoop -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-core</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-common</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
    
            <!-- Hortonworks Hive -->
            <dependency>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-jdbc</artifactId>
                <version>${hortonworks.hive.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-exec</artifactId>
                <version>${hortonworks.hive.version}</version>
            </dependency>
    
            <!-- slf4j -->
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-api</artifactId>
                <version>${slf4j.version}</version>
            </dependency>
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
                <version>${slf4j.version}</version>
            </dependency>
    
        </dependencies>
    </project>
    
    
    2、自定义UDTF
    package com.mengyao.hadoop.hortonworks.hive.udf;
    
    import java.util.ArrayList;
    
    import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    
    public class Udf1 extends GenericUDTF {
        
        @Override
        public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
            if (args.length!=3) {
                throw new UDFArgumentLengthException("参数小于3个字段");
            }
            if (args[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
                throw new UDFArgumentException("参数不是基本类型");
            }
            ArrayList<String> fieldNames = new ArrayList<String>();
            ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
            fieldNames.add("floor");
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
            fieldNames.add("poiName");
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
            fieldNames.add("x1");
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
            fieldNames.add("y1");
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
            fieldNames.add("x2");
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
            fieldNames.add("y2");
            fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    
            return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames,fieldOIs);
        }
    
        @Override
        public void close() throws HiveException {
        }
    
        @Override
        public void process(Object[] args) throws HiveException {
            final String floor = args[0].toString();
            final String poiNameCn = args[1].toString();
            final String geom = args[2].toString();
            String input = geom.replace("MULTIPOLYGON(((", "").replace(")))", "");
            //5
            String[] fields = input.split(",");
            for(int i=0; i<fields.length; i++) {
                try {
                    String[] arr = new String[6];
                    arr[0]=floor;
                    arr[1]=poiNameCn;
                    String[] tmp1 = fields[i].split(" ");
                    arr[2]=tmp1[0];
                    arr[3]=tmp1[1];
                    if (i+1<fields.length) {
                        String[] tmp2 = fields[i+1].split(" ");
                        arr[4]=tmp2[0];
                        arr[5]=tmp2[1];
                    }
                    forward(arr);
                } catch (Exception e) {
                    continue;
                }
            }
        }
        
    }
    
    
    3、原始数据
    hive> select floor_id,poi_name_cn,geom from tmp_source_poi limit 10;
    OK
    F8    楼梯    MULTIPOLYGON(((27.7363472618163 -76.4068597634723,27.7363472618163 -80.0131241459239,19.72894019261 -80.0118552827063,19.502938400954 -76.4055550890202,27.7363472618163 -76.4068597634723)))
    F8    避风塘    MULTIPOLYGON(((142.38873291015625 -89.830322265625,142.38848876953125 -92.31036376953125,142.3878173828125 -95.64105224609375,151.43511962890625 -95.62530517578125,151.4322509765625 -114.43475341796875,145.563720703125 -114.43389892578125,140.51824951171875 -114.43310546875,140.51849365234375 -111.59051513671875,140.52069091796875 -98.031005859375,125.305908203125 -98.02862548828125,125.3070068359375 -92.30804443359375,128.85321044921875 -92.30859375,138.40338134765625 -92.309814453125,138.40386962890625 -89.82965087890625,142.38873291015625 -89.830322265625)))
    F8    咖啡时间    MULTIPOLYGON(((73.9320653229952 -73.1165769445031,73.9267330393195 -99.5226487368938,60.0283501408994 -99.5200368016607,42.2555900737643 -99.5584639401271,42.2556011453271 -95.2023675085028,39.3781932592392 -95.2019115369498,38.6626852564514 -95.2017981567676,38.6649185493588 -81.1082055937875,41.4105622805655 -81.108640672537,41.4118176437914 -73.1864483631389,73.9320653229952 -73.1165769445031)))
    F8    电梯    MULTIPOLYGON(((27.7363472618163 -73.7509837455356,27.7363472618163 -76.4068597521423,27.0886035524309 -76.4067571138262,19.502938400954 -76.4055550890202,19.336468026042 -73.7496526911885,27.7363472618163 -73.7509837455356)))
    F8    电梯    MULTIPOLYGON(((71.791599329561 -120.789716028472,71.7846846170723 -123.743601208178,63.5224315151572 -123.589247200859,63.5290417298674 -120.765438285686,71.791599329561 -120.789716028472)))
    F8    电梯    MULTIPOLYGON(((151.438877351582 -71.140565793033,143.380713831633 -71.1289797308272,143.41783554107 -67.7106754227564,151.439391598105 -67.8952907441503,151.438877351582 -71.140565793033)))
    F8    自动扶梯    MULTIPOLYGON(((129.16515929997 -72.9842409176565,129.16515929997 -69.2177793933323,136.625542271882 -69.2177793933323,136.599083889276 -72.9842409176565,129.16515929997 -72.9842409176565)))
    F8    自动扶梯    MULTIPOLYGON(((90.0243225693702 -73.1198265228713,90.0237087868154 -76.1593672441145,83.4253928624093 -76.2595438690263,83.4661638364195 -73.1185022099924,90.0243225693702 -73.1198265228713)))
    F8    卡乐时代    MULTIPOLYGON(((56.796247374266386 -117.50865583121777,56.6748490780592 -117.49650815501809,48.22472807019949 -117.49516914784908,48.26815593987703 -123.26608142256737,47.20688885077834 -123.24331614375114,47.17912720888853 -121.70872777327895,23.973091162741184 -120.98440674319863,22.212835498154163 -95.15799333155155,27.736347261816263 -95.15799333155155,27.736347261816263 -97.24820585176349,28.933990716934204 -97.24888809770346,29.609365351498127 -97.24910343065858,39.72476739436388 -97.26569728925824,39.74184823036194 -100.99478274583817,56.65072625130415 -100.9862755574286,56.796247374266386 -117.50865583121777)))
    F8    一味一诚    MULTIPOLYGON(((104.10507772862911 -113.762516528368,104.10343369841576 -124.07710339874029,71.78468461707234 -123.74360121414065,71.80045375600457 -117.0072059892118,80.00307461991906 -117.11536817625165,80.00307461991906 -111.21669100224972,85.23863035440445 -113.57487217336893,104.10507772862911 -113.762516528368)))
    Time taken: 0.052 seconds, Fetched: 10 row(s)
    hive>
    
    
    4、执行UDTF验证结果
    hive> select poipoint(floor_id,poi_name_cn,geom) as (floor,poiName,x1,y1,x2,y2) from tmp_source_poi limit 10;
    OK
    F8    楼梯    27.7363472618163    -76.4068597634723    27.7363472618163    -80.0131241459239
    F8    楼梯    27.7363472618163    -80.0131241459239    19.72894019261    -80.0118552827063
    F8    楼梯    19.72894019261    -80.0118552827063    19.502938400954    -76.4055550890202
    F8    楼梯    19.502938400954    -76.4055550890202    27.7363472618163    -76.4068597634723
    F8    楼梯    27.7363472618163    -76.4068597634723    NULL    NULL
    F8    避风塘    142.38873291015625    -89.830322265625    142.38848876953125    -92.31036376953125
    F8    避风塘    142.38848876953125    -92.31036376953125    142.3878173828125    -95.64105224609375
    F8    避风塘    142.3878173828125    -95.64105224609375    151.43511962890625    -95.62530517578125
    F8    避风塘    151.43511962890625    -95.62530517578125    151.4322509765625    -114.43475341796875
    F8    避风塘    151.4322509765625    -114.43475341796875    145.563720703125    -114.43389892578125
    Time taken: 0.059 seconds, Fetched: 10 row(s)
    hive>
  • 相关阅读:
    poj1837 Balance

    字符流
    字节流
    File类
    this和static
    异常
    接口
    抽象类
    多态
  • 原文地址:https://www.cnblogs.com/mengyao/p/5195116.html
Copyright © 2011-2022 走看看