zoukankan      html  css  js  c++  java
  • excel 到dremio 查询处理

    参考解决方案

    原理说明

    用户通过rest api 请求上传excel 文件,可以存储到s3中,然后我们可以通过监听s3的webhooks 或者上传的时候进行格式转换同时存储s3
    格式可以支持的包含jsonlines,parquet。。。同时基于dremio直接查询s3的能力,方便的进行业务分析

    环境准备

     
    version: "3"
    services:
      dremio:
          image: dremio/dremio-oss
          ports:
            - "9047:9047"
      s3:
          image: minio/minio
          environment:
            - "MINIO_ACCESS_KEY=minio"
            - "MINIO_SECRET_KEY=minio123"
          command: server /data  --console-address ":9001"
          ports:
            - "9000:9000"
            - "9001:9001"

    参考代码

    • pom
    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
     
        <groupId>org.example</groupId>
        <artifactId>excel-reader</artifactId>
        <version>1.0-SNAPSHOT</version>
     
        <properties>
            <maven.compiler.source>8</maven.compiler.source>
            <maven.compiler.target>8</maven.compiler.target>
        </properties>
     
        <dependencies>
            <dependency>
                <groupId>org.projectlombok</groupId>
                <artifactId>lombok</artifactId>
                <version>1.18.20</version>
                <scope>provided</scope>
                <optional>true</optional>
            </dependency>
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>easyexcel</artifactId>
                <version>2.2.8</version>
            </dependency>
            <dependency>
                <groupId>io.minio</groupId>
                <artifactId>minio</artifactId>
                <version>8.3.0</version>
                <exclusions>
                    <exclusion>
                        <groupId>com.fasterxml.jackson.core</groupId>
                        <artifactId>jackson-core</artifactId>
                    </exclusion>
                    <exclusion>
                        <groupId>com.fasterxml.jackson.core</groupId>
                        <artifactId>jackson-annotations</artifactId>
                    </exclusion>
                    <exclusion>
                        <groupId>com.fasterxml.jackson.core</groupId>
                        <artifactId>jackson-databind</artifactId>
                    </exclusion>
                </exclusions>
            </dependency>
            <dependency>
                <groupId>commons-io</groupId>
                <artifactId>commons-io</artifactId>
                <version>2.11.0</version>
            </dependency>
            <dependency>
                <groupId>com.fasterxml.jackson.core</groupId>
                <artifactId>jackson-databind</artifactId>
                <version>2.12.5</version>
            </dependency>
        </dependencies>
    </project> 
    • java 代码
      代码使用了easyexcel 进行文件读取,同时基于jackson 生成jsonlines 文件,之后存储s3
     
    package com.dalong;
    import com.alibaba.excel.EasyExcel;
    import com.alibaba.excel.context.AnalysisContext;
    import com.alibaba.excel.enums.CellExtraTypeEnum;
    import com.alibaba.excel.event.AnalysisEventListener;
    import com.alibaba.excel.metadata.CellExtra;
    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.SequenceWriter;
    import io.minio.MinioClient;
    import io.minio.PutObjectArgs;
    import io.minio.errors.*;
    import lombok.SneakyThrows;
     
    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.security.InvalidKeyException;
    import java.security.NoSuchAlgorithmException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.function.Consumer;
     
    public class Application {
        private static final ObjectMapper objectMapper = new ObjectMapper();
        private static MinioClient minioClient =
                MinioClient.builder()
                        .endpoint("http://localhost:9000")
                        .credentials("minio", "minio123")
                        .build();
        public static void main(String[] args) {
            readExcelFile();
        }
        public static void writeExcelContent2S3JSONLines(List<Object> input) throws ServerException, InsufficientDataException, ErrorResponseException, IOException, NoSuchAlgorithmException, InvalidKeyException, InvalidResponseException, XmlParserException, InternalException {
            try (ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream()) {
                try (SequenceWriter seq = objectMapper.writer()
                        .withRootValueSeparator("
    ").writeValues(byteArrayOutputStream)) {
                    input.forEach(new Consumer<Object>() {
                        @SneakyThrows
                        @Override
                        public void accept(Object s) {
                            seq.write(s);
                        }
                    });
                    try (ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray())) {
                        minioClient.putObject(
                                PutObjectArgs.builder().bucket("demo").object("demoapp.ldjson").stream(
                                                byteArrayInputStream, -1, 10485760)
                                        .build());
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
     
     
        }
     
        private static void readExcelFile() {
            InputStream stream = Application.class.getClassLoader().getResourceAsStream("excel-jsonlines.xlsx");
            EasyExcel.read(stream, new AnalysisEventListener() {
                private List<Object> results = new ArrayList<>();
     
                @SneakyThrows
                @Override
                public void invoke(Object o, AnalysisContext analysisContext) {
                    results.add(o);
                    System.out.println(objectMapper.writeValueAsString(o));
                }
     
                @SneakyThrows
                @Override
                public void extra(CellExtra extra, AnalysisContext context) {
                    System.out.println(extra.getType());
                    System.out.println(objectMapper.writeValueAsString(extra));
                }
     
                @SneakyThrows
                @Override
                public void doAfterAllAnalysed(AnalysisContext analysisContext) {
                    writeExcelContent2S3JSONLines(results);
                    System.out.println("after");
                }
            }).extraRead(CellExtraTypeEnum.MERGE).sheet().doRead();
        }
    }
     
    • dremio 查询效果

    说明

    实际上dremio 是可以读取excel的,但是很多时候可能因为一些限制或者原因,造成使用问题(比如dremio 对于excel 文件大小限制,以及数据使用场景的问题)
    基于上边的方式都是一个不错的选择

    参考资料

    https://jsonlines.org/examples/
    https://www.yuque.com/easyexcel/doc/easyexcel

  • 相关阅读:
    IDEA-各模块间引用出现问题的解决方法
    【MyBatis学习06】_parameter:解决There is no getter for property named in class java.lang.String
    《转载》JVM垃圾回收机制
    java面试复习题四
    Java中excel转换为jpg/png图片 采用aspose-cells-18.6.jar
    POI导出复杂的excel;excel公共样式类;excel拼接定制类;数据科学计数法转为普通值
    java发送邮件无法显示图片 图裂 的解决办法
    pom.xml文件最详细的讲解
    Tomcat启动报Error listenerStart错误 Context [] startup failed due to previous errors
    ora-01031:insufficient privileges解决方法
  • 原文地址:https://www.cnblogs.com/rongfengliang/p/15216547.html
Copyright © 2011-2022 走看看