zoukankan      html  css  js  c++  java
  • 开发一个dremio user_agent 解析函数

    参考apache drill 实现一个user_agent 解析的函数

    项目结构

    • maven 项目结构
     
    ├── pom.xml
    └── src
        └── main
            ├── java
            │   └── com
            │       └── dalong
            │           ├── udf
            │           │   ├── MyFunc.java
            │           │   ├── UAAPP.java
            │           │   └── UserAgentAnalyzerProvider.java
            │           └── helper
            └── resources
                └── sabot-module.conf
     
    • 代码说明
      sabot-module.conf 老样子配置包扫描
     
    # Add the UDF package to the list of packages covered by Dremio's classpath scanning.
    dremio.classpath.scanning.packages += com.dalong.udf
    • pom.xml
      主要是user agent 解析依赖包的添加,以及maven-shade-plugin 插件的配置
     
    <?xml version="1.0" encoding="UTF-8"?>
    <!-- Build descriptor for the dremio-func UDF jar (parse_user_agent and friends). -->
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
     
        <groupId>com.dalong</groupId>
        <artifactId>dremio-func</artifactId>
        <version>2.0-SNAPSHOT</version>
     
        <properties>
            <maven.compiler.source>8</maven.compiler.source>
            <maven.compiler.target>8</maven.compiler.target>
            <!-- Single place to pin the Dremio version all Dremio artifacts below are built against. -->
            <version.dremio>13.0.0-202101272034330307-20fb9275</version.dremio>
        </properties>
     
        <dependencies>
            <!-- Dremio execution kernel: "provided" because the Dremio server supplies it at runtime. -->
            <dependency>
                <groupId>com.dremio.sabot</groupId>
                <artifactId>dremio-sabot-kernel</artifactId>
                <version>${version.dremio}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>com.dremio.sabot</groupId>
                <artifactId>dremio-sabot-kernel</artifactId>
                <version>${version.dremio}</version>
                <classifier>tests</classifier>
                <scope>test</scope>
            </dependency>
            <!-- JUnit and Hamcrest are declared BEFORE Mockito on purpose (see the Mockito note below). -->
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <scope>test</scope>
                <version>4.12</version>
            </dependency>
            <dependency>
                <groupId>org.hamcrest</groupId>
                <artifactId>hamcrest-all</artifactId>
                <scope>test</scope>
                <version>1.3</version>
            </dependency>
            <dependency>
                <!-- Mockito needs to be on the class path after JUnit (or Hamcrest)
                  as long as Mockito _contains_ older Hamcrest classes. -->
                <groupId>org.mockito</groupId>
                <artifactId>mockito-core</artifactId>
                <scope>test</scope>
                <version>1.10.19</version>
            </dependency>
            <dependency>
                <groupId>com.dremio</groupId>
                <artifactId>dremio-common</artifactId>
                <classifier>tests</classifier>
                <version>${version.dremio}</version>
                <scope>test</scope>
            </dependency>
            <!-- User-agent parsing libraries. -->
            <dependency>
                <groupId>net.sf.uadetector</groupId>
                <artifactId>uadetector-resources</artifactId>
                <version>2014.04</version>
            </dependency>
            <dependency>
                <groupId>nl.basjes.parse.useragent</groupId>
                <artifactId>yauaa</artifactId>
                <version>5.9</version>
            </dependency>
        </dependencies>
     
        <!-- HTTPS is required: Maven 3.8.1+ blocks plain-HTTP repositories by default. -->
        <repositories>
            <repository>
                <id>dremio-free</id>
                <url>https://maven.dremio.com/free/</url>
            </repository>
            <repository>
                <id>dremio-public</id>
                <url>https://maven.dremio.com/public/</url>
            </repository>
        </repositories>
        <build>
            <plugins>
                <!-- Bundle the Yauaa parser and the listed libraries into the UDF jar. -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-shade-plugin</artifactId>
                    <version>3.2.3</version>
                    <executions>
                        <execution>
                            <phase>package</phase>
                            <goals>
                                <goal>shade</goal>
                            </goals>
                            <configuration>
                                <artifactSet>
                                    <includes>
                                        <include>nl.basjes.parse.useragent:yauaa</include>
                                        <include>nl.basjes.collections:prefixmap</include>
                                        <include>org.apache.commons:commons-text</include>
                                        <include>org.apache.commons:commons-collections4</include>
                                    </includes>
                                </artifactSet>
                            </configuration>
                        </execution>
                    </executions>
                </plugin>
                <!-- Also produce a -sources jar; it is deployed next to the main jar. -->
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-source-plugin</artifactId>
                    <version>3.2.1</version>
                    <executions>
                        <execution>
                            <id>attach-sources</id>
                            <phase>package</phase>
                            <goals>
                                <goal>jar-no-fork</goal>
                            </goals>
                        </execution>
                    </executions>
                </plugin>
            </plugins>
        </build>
    </project>
    • 核心代码
      UAAPP.java
     
    package com.dalong.udf;
     
    import com.dremio.common.expression.CompleteType;
    import com.dremio.common.expression.LogicalExpression;
    import com.dremio.exec.expr.SimpleFunction;
    import com.dremio.exec.expr.annotations.FunctionTemplate;
    import com.dremio.exec.expr.annotations.Output;
    import com.dremio.exec.expr.annotations.Param;
    import com.dremio.exec.expr.annotations.Workspace;
    import com.dremio.exec.expr.fn.OutputDerivation;
    import org.apache.arrow.memory.ArrowBuf;
    import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter;
    import org.apache.arrow.vector.holders.VarCharHolder;
    import org.apache.arrow.vector.types.pojo.ArrowType;
    import javax.inject.Inject;
    import java.util.List;
     
    /**
     * Holder for the {@code parse_user_agent} Dremio UDF and the output-type derivation it needs.
     */
    public class UAAPP {
        /**
         * Parses a user-agent string with Yauaa and returns a struct with one VARCHAR entry per
         * field listed in {@link #setup()}.
         *
         * <p>A custom {@code derivation} is mandatory: a function with a {@code ComplexWriter}
         * output cannot use the default derivation.
         *
         * <p>Classes referenced inside {@code setup()} and {@code eval()} are written with their
         * fully qualified names on purpose; keep it that way when editing these method bodies.
         */
        @FunctionTemplate(names = {"parse_user_agent"}, isDeterministic = false, derivation = UAGenOutput.class)
        public static class UA implements SimpleFunction {
            /** The user-agent string to parse. */
            @Param
            VarCharHolder input;
            /** Complex (struct) output, hence a ComplexWriter rather than a plain holder. */
            @Output
            ComplexWriter outWriter;
            /** Scratch buffer used to stage each field value before it is written out. */
            @Inject
            ArrowBuf outBuffer;
            /** Shared analyzer instance, obtained once in {@link #setup()}. */
            @Workspace
            nl.basjes.parse.useragent.UserAgentAnalyzer uaa;
            /** Names of the Yauaa fields emitted into the output struct. */
            @Workspace
            List<String> allFileds;

            /** Initializes the shared workspace objects. */
            public void setup() {
                uaa = com.dalong.udf.UserAgentAnalyzerProvider.getInstance();
                // Must stay in sync with the fields declared by UAGenOutput#getOutputType.
                allFileds= java.util.Arrays.asList("DeviceClass","DeviceName","DeviceBrand","DeviceCpu","OperatingSystemClass","OperatingSystemName","OperatingSystemVersion","OperatingSystemNameVersion","LayoutEngineClass","LayoutEngineName","LayoutEngineVersion","LayoutEngineVersionMajor","LayoutEngineNameVersion","LayoutEngineNameVersionMajor","AgentClass","AgentName","AgentVersion","AgentVersionMajor","AgentNameVersion","AgentNameVersionMajor");
            }

            /** Parses the current input value and writes every configured field into the struct. */
            public void eval() {
                org.apache.arrow.vector.complex.writer.BaseWriter.StructWriter queryMapWriter = outWriter.rootAsStruct();
                if (input.isSet == 0) {
                    // Unset input: return an empty struct.
                    queryMapWriter.start();
                    queryMapWriter.end();
                    return;
                }
                // Decode exactly [start, end): the holder's buffer is shared, so the value
                // does not necessarily begin at offset 0.
                String userAgentString = com.dremio.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(input.start, input.end, input.buffer);
                nl.basjes.parse.useragent.UserAgent agent = uaa.parse(userAgentString);
                queryMapWriter.start();
                for (String fieldName : allFileds){
                    String field = agent.getValue(fieldName);
                    if (field == null) {
                        // Defensive: leave the entry unset rather than fail the whole query.
                        continue;
                    }
                    // VARCHAR payloads are UTF-8; do not rely on the platform default charset.
                    byte[] rowStringBytes = field.getBytes(java.nio.charset.StandardCharsets.UTF_8);
                    // reallocIfNeeded may hand back a different, larger buffer; keep the returned one.
                    outBuffer = outBuffer.reallocIfNeeded(rowStringBytes.length);
                    outBuffer.setBytes(0, rowStringBytes);
                    org.apache.arrow.vector.holders.VarCharHolder rowHolder = new org.apache.arrow.vector.holders.VarCharHolder();
                    rowHolder.start = 0;
                    rowHolder.end = rowStringBytes.length;
                    rowHolder.buffer = outBuffer;
                    queryMapWriter.varChar(fieldName).write(rowHolder);
                }
                queryMapWriter.end();
            }
        }

        /**
         * Declares the output type of {@code parse_user_agent}: a struct with a fixed set of
         * VARCHAR fields. The field list is currently hard-coded; it must list every field the
         * function writes.
         */
        public static class UAGenOutput implements OutputDerivation {
            /**
             * @param baseReturn the default return type (ignored)
             * @param args the call arguments (ignored)
             * @return a struct type with one VARCHAR child per emitted user-agent field
             */
            public CompleteType getOutputType(CompleteType baseReturn, List<LogicalExpression> args) {
                return new CompleteType(
                        ArrowType.Struct.INSTANCE,
                        CompleteType.VARCHAR.toField("DeviceClass"),
                        CompleteType.VARCHAR.toField("DeviceName"),
                        CompleteType.VARCHAR.toField("DeviceBrand"),
                        CompleteType.VARCHAR.toField("DeviceCpu"),
                        CompleteType.VARCHAR.toField("OperatingSystemClass"),
                        CompleteType.VARCHAR.toField("OperatingSystemName"),
                        CompleteType.VARCHAR.toField("OperatingSystemVersion"),
                        CompleteType.VARCHAR.toField("OperatingSystemNameVersion"),
                        CompleteType.VARCHAR.toField("LayoutEngineClass"),
                        CompleteType.VARCHAR.toField("LayoutEngineName"),
                        CompleteType.VARCHAR.toField("LayoutEngineVersion"),
                        CompleteType.VARCHAR.toField("LayoutEngineVersionMajor"),
                        CompleteType.VARCHAR.toField("LayoutEngineNameVersion"),
                        CompleteType.VARCHAR.toField("LayoutEngineNameVersionMajor"),
                        CompleteType.VARCHAR.toField("AgentClass"),
                        CompleteType.VARCHAR.toField("AgentName"),
                        CompleteType.VARCHAR.toField("AgentVersion"),
                        CompleteType.VARCHAR.toField("AgentVersionMajor"),
                        CompleteType.VARCHAR.toField("AgentNameVersion"),
                        CompleteType.VARCHAR.toField("AgentNameVersionMajor"));
            }
        }
    }

    使用

    • 编译
    mvn clean  package -DskipTests
    • copy jars
      需要同时拷贝编译后的 jar 和对应的源码 jar(dremio 与 drill 一样,基于 java 代码生成来执行函数,依赖 janino 在运行时编译函数源码,因此源码必须在 classpath 中)
    • 制作一个docker 镜像
     
    # Start from the Dremio OSS 13.0 image.
    FROM dremio/dremio-oss:13.0
    # Install the UDF jar together with its sources jar into Dremio's jars directory.
    COPY dremio-func-2.0-SNAPSHOT.jar /opt/dremio/jars/
    COPY dremio-func-2.0-SNAPSHOT-sources.jar /opt/dremio/jars/
    # Pull /opt/arthas out of the hengyunabc/arthas image into this image.
    COPY --from=hengyunabc/arthas:latest /opt/arthas /opt/arthas
    • 效果

    导入一些数据

     
    -- Apply the parse_user_agent UDF to the ua column and select it alongside myinfo.
    select parse_user_agent(ua),myinfo from mypg.public.ua2
     

    几个问题

    • 默认derivation
      提示信息
     
    com.google.inject.CreationException: Unable to create injector, see the following errors:
    1) Error in custom provider, java.lang.AssertionError: Function [com.dalong.udf.UA] has a ComplexWriter output but it's using the Default derivation   

    解决方法,需要实现自己的OutputDerivation,注意字段需要完整(同时我们使用的是struct,需要指明)

    • 类全名称
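      与 drill 一样,setup/eval 方法体中引用的类必须写全限定类名(例如 org.apache.arrow.vector.holders.VarCharHolder),因为方法体会被拷贝到生成的代码中,import 不会随之生效;同时对于需要在多次调用之间共享的对象使用 @Workspace 注解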

    参考资料

    https://github.com/rongfengliang/dremio-user-agent-parse-func
    https://github.com/dremio/dremio-oss/blob/master/sabot/kernel/src/main/java/com/dremio/exec/expr/fn/impl/Mappify.java

  • 相关阅读:
    vue --- 脚手架初始化项目中配置文件webpack.base.conf.js代码含义
    Chrome
    es8 --- 新特性
    es7 --- 新特性
    vue --- 关于多个router-view视图组件,渲染同一页面
    vue ---- 组件传值之间使用 v-model
    vue --- watch 高级用法
    js --- 递归结构图
    es6 --- Generator 函数
    es6 -- set 数据结构
  • 原文地址:https://www.cnblogs.com/rongfengliang/p/14401691.html
Copyright © 2011-2022 走看看