zoukankan      html  css  js  c++  java
  • java

    读取orc文件

        /**
         * Reads the ORC file at {@code /tmp/Orc.orc} batch by batch and prints
         * each {@link VectorizedRowBatch} to standard output.
         *
         * @throws IOException if the file cannot be opened or a batch cannot be read
         */
        @Test
        public void readOrc() throws IOException {
            Configuration conf = new Configuration();
            Reader reader = OrcFile.createReader(new Path("/tmp/Orc.orc"),
                    OrcFile.readerOptions(conf));
            RecordReader rows = reader.rows();
            try {
                // The batch layout is derived from the schema stored in the file itself.
                VectorizedRowBatch batch = reader.getSchema().createRowBatch();
                while (rows.nextBatch(batch)) {
                    System.out.println(batch.toString());
                }
            } finally {
                // Release the record reader even when reading a batch fails.
                rows.close();
            }
        }

    写orc文件——单行

        /**
         * Writes a single row ({@code x = 2, y = 6}) to {@code /tmp/Orc.orc}
         * using the schema {@code struct<x:int,y:int>}.
         *
         * <p>An existing file at that path is overwritten so the test can be
         * re-run, and the writer is closed even when adding the batch fails.
         *
         * @throws IOException if the file cannot be created or written
         */
        @Test
        public void writeLine3() throws IOException {
            Configuration conf = new Configuration();
            TypeDescription schema = TypeDescription.fromString("struct<x:int,y:int>");
            Writer writer = OrcFile.createWriter(new Path("/tmp/Orc.orc"),
                    OrcFile.writerOptions(conf)
                            .setSchema(schema)
                            .overwrite(true));
            try {
                VectorizedRowBatch batch = schema.createRowBatch();
                LongColumnVector x = (LongColumnVector) batch.cols[0];
                LongColumnVector y = (LongColumnVector) batch.cols[1];
                // Claim the next free slot in the batch, then fill every column of that row.
                int row = batch.size++;
                x.vector[row] = 2;
                y.vector[row] = 2 * 3;
                if (batch.size != 0) {
                    writer.addRowBatch(batch);
                    batch.reset();
                }
            } finally {
                writer.close();
            }
        }

    写orc文件——多行

        /**
         * Writes several comma-separated rows to {@code /tmp/Orc.orc}, overwriting
         * any existing file.
         *
         * <p>Each input line is split on {@code ','} and the i-th token is stored in
         * the i-th column of the batch. A batch is handed to the writer only after a
         * row is completely populated, so a row is never split across two batches.
         *
         * @throws IOException if the file cannot be created or written
         */
        @Test
        public void writeLine2() throws IOException {
            String[] lines = new String[]{"1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd", "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd", "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd", "1,a,aa", "2,b,bb", "3,c,cc", "4,d,dd"};

            Configuration conf = new Configuration();
            TypeDescription schema = TypeDescription.fromString("struct<field1:String,field2:String,field3:String>");
            Writer writer = OrcFile.createWriter(new Path("/tmp/Orc.orc"),
                    OrcFile.writerOptions(conf)
                            .setSchema(schema).overwrite(true));
            try {
                VectorizedRowBatch batch = schema.createRowBatch();

                for (String line : lines) {
                    String[] columns = line.split(",");
                    System.out.println(batch.size);
                    int row = batch.size++;
                    for (int i = 0; i < columns.length; i++) {
                        setColumnValue(batch.cols[i], row, columns[i]);
                    }
                    // Flush only once the whole row is filled in; flushing between
                    // columns would write a half-populated row and then reset the batch
                    // underneath the remaining column assignments.
                    if (batch.size == batch.getMaxSize()) {
                        writer.addRowBatch(batch);
                        batch.reset();
                    }
                }
                // Write whatever is left in the final, partially filled batch.
                if (batch.size != 0) {
                    writer.addRowBatch(batch);
                    batch.reset();
                }
            } finally {
                writer.close();
            }
        }

        /**
         * Parses {@code value} according to the concrete type of {@code vector} and
         * stores it at index {@code row}.
         *
         * @param vector the column vector to write into
         * @param row    the row index inside the current batch
         * @param value  the textual value to convert and store
         * @throws IllegalArgumentException if the vector type is not supported
         */
        private static void setColumnValue(ColumnVector vector, int row, String value) {
            if (vector instanceof BytesColumnVector) {
                // Encode once, with an explicit charset rather than the platform default.
                byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
                ((BytesColumnVector) vector).setVal(row, bytes, 0, bytes.length);
            } else if (vector instanceof Decimal64ColumnVector) {
                // Checked before LongColumnVector because Decimal64ColumnVector extends it.
                ((Decimal64ColumnVector) vector).set(row, HiveDecimal.create(value));
            } else if (vector instanceof LongColumnVector) {
                ((LongColumnVector) vector).vector[row] = Long.parseLong(value);
            } else if (vector instanceof DecimalColumnVector) {
                ((DecimalColumnVector) vector).set(row, HiveDecimal.create(value));
            } else if (vector instanceof DoubleColumnVector) {
                ((DoubleColumnVector) vector).vector[row] = Double.parseDouble(value);
            } else if (vector instanceof TimestampColumnVector) {
                ((TimestampColumnVector) vector).set(row, java.sql.Timestamp.valueOf(value));
            } else {
                // Failing loudly beats silently leaving the cell at its default value.
                throw new IllegalArgumentException(
                        "Unsupported column vector type: " + vector.getClass().getName());
            }
        }

    引用的类（import语句）

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.common.type.HiveDecimal;
    import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.*;
    import org.junit.Test;
    
    import java.io.File;
    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.List;
  • 相关阅读:
    Java8 Optional使用方式
    ABAC框架-casbin
    Java数据脱敏(手机号|邮箱号|身份证号|银行卡号)
    使用OpenOffice将office文件转为pdf
    在线审批流设计
    Java 将带逗号的字符串转为List
    Java8 lambda常用操作
    Markdown合并单元格
    本博客已搬迁至rcst.xyz
    涂色(题解)
  • 原文地址:https://www.cnblogs.com/BigWrite/p/13651478.html
Copyright © 2011-2022 走看看