zoukankan      html  css  js  c++  java
  • Hive自定义UDF

    1、添加依赖jar
    <!-- Maven build descriptor for a Hive custom-UDF jar targeting the
         Hortonworks HDP 2.3.2 stack (Hadoop 2.7.1 / Hive 1.2.1 HDP builds). -->
    <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
    
        <groupId>com.mengyao.dataformat</groupId>
        <artifactId>hortonworks</artifactId>
        <version>0.0.1-SNAPSHOT</version>
        <packaging>jar</packaging>
    
        <name>hortonworks</name>
        <url>http://maven.apache.org</url>
    
        <repositories>
            <!-- hortonworks -->
            <!-- HDP artifacts are not in Maven Central; pull releases from the
                 Hortonworks repository. Snapshots are disabled on purpose. -->
            <repository>
                <releases>
                    <enabled>true</enabled>
                    <updatePolicy>always</updatePolicy>
                    <!-- warn (not fail) on checksum mismatch for releases -->
                    <checksumPolicy>warn</checksumPolicy>
                </releases>
                <snapshots>
                    <enabled>false</enabled>
                    <updatePolicy>never</updatePolicy>
                    <checksumPolicy>fail</checksumPolicy>
                </snapshots>
                <id>HDPReleases</id>
                <name>HDP Releases</name>
                <url>http://repo.hortonworks.com/content/repositories/releases/</url>
                <layout>default</layout>
            </repository>
            <!-- cloudera -->
            <!-- Alternative CDH repository, kept for reference but disabled. -->
            <!-- 
            <repository> 
                <id>cloudera</id> 
                <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url> 
            </repository>
             -->
        </repositories>
    
        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <junit.version>4.10</junit.version>
            <!-- HDP-build version strings: <apache version>.<HDP stack version> -->
            <hortonworks.hadoop.version>2.7.1.2.3.2.0-2950</hortonworks.hadoop.version>
            <hortonworks.hive.version>1.2.1.2.3.2.0-2950</hortonworks.hive.version>
            <slf4j.version>1.7.10</slf4j.version>
        </properties>
    
        <dependencies>
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>${junit.version}</version>
                <scope>test</scope>
            </dependency>
            <!-- system-scope workaround so tools.jar resolves inside IDEs;
                 requires JAVA_HOME to point at a JDK (not a JRE). -->
            <dependency>
                <groupId>jdk.tools</groupId>
                <artifactId>jdk.tools</artifactId>
                <version>1.7</version>
                <scope>system</scope>
                <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
            </dependency>
            <dependency>
                <groupId>org.mortbay.jetty</groupId>
                <artifactId>jetty</artifactId>
                <version>6.1.26</version>
            </dependency>
    
            <!-- HortonWorks Hadoop -->
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-common</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-hdfs</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-core</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-mapreduce-client-common</artifactId>
                <version>${hortonworks.hadoop.version}</version>
            </dependency>
    
            <!-- Hortonworks Hive: hive-exec provides the UDF base class -->
            <dependency>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-jdbc</artifactId>
                <version>${hortonworks.hive.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hive</groupId>
                <artifactId>hive-exec</artifactId>
                <version>${hortonworks.hive.version}</version>
            </dependency>
    
            <!-- slf4j -->
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-api</artifactId>
                <version>${slf4j.version}</version>
            </dependency>
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
                <version>${slf4j.version}</version>
            </dependency>
    
        </dependencies>
    </project>
    
    
    2、自定义Hive的UDF函数
    package com.mengyao.hadoop.hortonworks.hive.udf;
    
    import java.io.DataInput;
    import java.io.DataOutput;
    import java.io.IOException;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.hadoop.hive.ql.exec.UDF;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.Writable;
    
    /**
     * Hive UDF that splits a Chinese address string into administrative
     * segments (province/city/region/county/street/road/other) by matching
     * each piece against its trailing character (省/市/区/县/街/路), and
     * returns all segments joined by tabs as a single {@link Text} value.
     *
     * <p>Register and use in Hive:
     * <pre>
     *   add jar splitAddr-0.0.1-SNAPSHOT.jar;
     *   create temporary function splitAddr
     *     as 'com.mengyao.hadoop.hortonworks.hive.udf.AddrSplitUDF';
     *   select splitAddr(shop_addr) from rtc_nuomi limit 10;
     * </pre>
     */
    public class AddrSplitUDF extends UDF {
    
        /**
         * Matches a run ending in one of the administrative suffixes, or any
         * remaining text. Compiled once and cached: the original compiled
         * this pattern on every splitAddr() call, i.e. once per input row.
         */
        private static final Pattern ADDR_PATTERN =
                Pattern.compile("(((.*省)|(.*市)|(.*区)|(.*县)|(.*街)|(.*路)).*?|.*)");
    
        /**
         * Writable holder for the split address segments. Segments that were
         * never matched stay {@code null}; {@link #write(DataOutput)} maps
         * them to the empty string because DataOutput.writeUTF(null) throws
         * NullPointerException (a round trip therefore reads them back as
         * "" rather than null).
         */
        public static class AddrBean implements Writable {
            private String province;
            private String city;
            private String region;
            private String county;
            private String street;
            private String road;
            private String other;
            private String make;
    
            public AddrBean() {
            }
    
            @Override
            public void readFields(DataInput in) throws IOException {
                this.province = in.readUTF();
                this.city = in.readUTF();
                this.region = in.readUTF();
                this.county = in.readUTF();
                this.street = in.readUTF();
                this.road = in.readUTF();
                this.other = in.readUTF();
                this.make = in.readUTF();
            }
    
            @Override
            public void write(DataOutput out) throws IOException {
                // Fix: writeUTF(null) throws NPE; unmatched segments are null,
                // so serialize them as the empty string instead.
                out.writeUTF(nullToEmpty(province));
                out.writeUTF(nullToEmpty(city));
                out.writeUTF(nullToEmpty(region));
                out.writeUTF(nullToEmpty(county));
                out.writeUTF(nullToEmpty(street));
                out.writeUTF(nullToEmpty(road));
                out.writeUTF(nullToEmpty(other));
                out.writeUTF(nullToEmpty(make));
            }
    
            /** Maps null to "" for safe writeUTF serialization. */
            private static String nullToEmpty(String s) {
                return s == null ? "" : s;
            }
    
            /** Bulk setter for all eight segments at once. */
            public void set(String province, String city, String region, String county, String street, String road, String other, String make) {
                this.province = province;
                this.city = city;
                this.region = region;
                this.county = county;
                this.street = street;
                this.road = road;
                this.other = other;
                this.make = make;
            }
            public String getProvince() {
                return province;
            }
            public void setProvince(String province) {
                this.province = province;
            }
            public String getCity() {
                return city;
            }
            public void setCity(String city) {
                this.city = city;
            }
            public String getRegion() {
                return region;
            }
            public void setRegion(String region) {
                this.region = region;
            }
            public String getCounty() {
                return county;
            }
            public void setCounty(String county) {
                this.county = county;
            }
            public String getStreet() {
                return street;
            }
            public void setStreet(String street) {
                this.street = street;
            }
            public String getRoad() {
                return road;
            }
            public void setRoad(String road) {
                this.road = road;
            }
            public String getOther() {
                return other;
            }
            public void setOther(String other) {
                this.other = other;
            }
            public String getMake() {
                return make;
            }
            public void setMake(String make) {
                this.make = make;
            }
    
            @Override
            public String toString() {
                // Fix: "make" was previously joined with "\t " (tab + space);
                // every segment now uses the same plain-tab delimiter.
                return province + "\t" + city + "\t" + region + "\t" + county
                        + "\t" + street + "\t" + road + "\t" + other + "\t" + make;
            }
        }
    
        /**
         * Splits {@code addrStr} into an {@link AddrBean}, assigning each
         * regex match to a segment by its trailing character. Later matches
         * with the same suffix overwrite earlier ones; unmatched segments
         * stay null (rendered as "null" by toString, as before).
         *
         * @param addrStr raw address text, must not be null
         * @return bean holding the recognized segments (never null)
         */
        public static AddrBean splitAddr(String addrStr) {
            Matcher matcher = ADDR_PATTERN.matcher(addrStr);
            AddrBean addr = new AddrBean();
            while (matcher.find()) {
                String str = matcher.group();
                if (str.isEmpty()) {
                    continue;
                }
                if (str.endsWith("省")) {
                    addr.setProvince(str);
                } else if (str.endsWith("市")) {
                    addr.setCity(str);
                } else if (str.endsWith("区")) {
                    addr.setRegion(str);
                } else if (str.endsWith("县")) {
                    addr.setCounty(str);
                } else if (str.endsWith("街")) {
                    addr.setStreet(str);
                } else if (str.endsWith("路")) {
                    addr.setRoad(str);
                } else {
                    addr.setOther(str);
                }
            }
            return addr;
        }
    
        /**
         * Hive entry point.
         *
         * @param addr the address column value; may be null (SQL NULL)
         * @return tab-separated segments, or null when the input is null
         */
        public Text evaluate(final Text addr) {
            if (null == addr) {
                return null;
            }
            // splitAddr never returns null, so no second null check is needed
            // (the original guarded against an impossible null result).
            return new Text(splitAddr(addr.toString()).toString());
        }
    
    }
    3、打包为Jar File:splitAddr-0.0.1-SNAPSHOT.jar
    
    
    4、注册自定义UDF到hive会话中(临时)
        4.1:在hive的会话中添加临时函数
            hive
            hive> add jar /home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar;
            Added [/home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar] to class path
            Added resources: [/home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar]
            hive> create temporary function splitAddr as 'com.mengyao.hadoop.hortonworks.hive.udf.AddrSplitUDF';
            OK
            Time taken: 0.444 seconds
            验证:select splitAddr(shop_addr) from rtc_nuomi limit 10;
            
        4.2:运行hive时加载配置文件
            vim init_func_splitAddr
                add jar /home/rta/mengyao/apps/udf/splitAddr-0.0.1-SNAPSHOT.jar;
                create temporary function splitAddr as 'com.mengyao.hadoop.hortonworks.hive.udf.AddrSplitUDF';
            hive -i init_func_splitAddr
            验证:select splitAddr(shop_addr) from rtc_nuomi limit 10;
  • 相关阅读:
    【FICO系列】SAP FICO 基本概念
    【MM系列】SAP 的库存管理
    【ABAP系列】SAP ABAP WRITE字段隐藏的方法
    【BASIS系列】SAP 批量锁住用户和TCODE的方法
    【HANA系列】SAP HANA 1.0 SPS 11 新特性
    【HANA系列】SAP HANA启动出现ERROR
    瓣呀,一个开源项目
    javascript入门笔记6-内置对象
    javascript入门笔记5-事件
    javascript入门笔记4-数组
  • 原文地址:https://www.cnblogs.com/mengyao/p/5099603.html
Copyright © 2011-2022 走看看