zoukankan      html  css  js  c++  java
  • iceberg文件详解

    一、数据内容

    t20
    ├── data
    │   ├── 00000-0-9c7ff22e-a767-4b85-91ec-a2771e54c209-00001.parquet
    │   └── 00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet
    └── metadata
        ├── 00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json
        ├── 00001-aabfd9a8-7dcd-4aa0-99aa-f6695f39bf6b.metadata.json
        ├── 00002-b5b7725f-7e86-454b-8d16-0e142bc84266.metadata.json
        ├── 0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro
        ├── f787e035-8f7c-43a3-b264-42057bad2710-m0.avro
        ├── snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro
        └── snap-6460256963744122971-1-f787e035-8f7c-43a3-b264-42057bad2710.avro
     

    二、文件详解

    data是数据,metadata是元数据
    建表时会生成metadata/00000-xx.metadata.json
    每做一次insert都会生成新的数据文件和元数据文件,其中包括新的00001-xx.metadata.json、00002-xx.metadata.json,依此类推

    1、数据

    xxx.parquet
    $ parquet head ~/t20/data/00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet                       
    {"id": 20}
    $ parquet head ~/t20/data/00000-0-9c7ff22e-a767-4b85-91ec-a2771e54c209-00001.parquet  
    {"id": 10}

    2、元数据

    (1)xxx.metadata.json

    从hive metastore的mysql库的TABLE_PARAMS表可以查到表的metadata_location,即当前xxx.metadata.json的位置。从这个文件可以拿到当前表的快照 id(current-snapshot-id),以及这张表的所有快照信息,也就是 JSON 信息里面的 snapshots 数组对应的值

    (2)清单列表(相当于snapshot):snap-xxx.avro

    清单列表记录了每个快照所包含的一系列清单文件,每行存储一个清单文件的路径、该清单文件里面数据文件的分区范围、增加了几个数据文件、删除了几个数据文件等信息。这些信息可以用来在查询时提供过滤,跳过不需要读取的清单文件
    | manifest_path | manifest_length | partition_spec_id | added_snapshot_id | added_data_files_count | existing_data_files_count | deleted_data_files_count | partitions | added_rows_count | existing_rows_count | deleted_rows_count |
    | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
    | hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro | 5514 | 0 | 6190364701448945732 | 1 | 0 | 0 | [] | 1 | 0 | 0 |
    | hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro | 5514 | 0 | 6190364701448945732 | 1 | 0 | 0 | [] | 1 | 0 | 0 |
     

    (3)清单:xxx.avro

    每行都是每个数据文件的详细描述,包括数据文件的状态、文件路径、分区信息、列级别的统计信息(比如每列的最大最小值、空值数等)、文件的大小以及文件里面数据的行数等信息。其中列级别的统计信息在 Scan 的时候可以为算子下推提供数据,以便可以过滤掉不必要的文件
    {
      "status": 1,
      "snapshot_id": {"long": 6460256963744122971},
      "data_file": {
        "file_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/data/00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet",
        "file_format": "PARQUET",
        "partition": {},
        "record_count": 1,
        "file_size_in_bytes": 387,
        "block_size_in_bytes": 67108864,
        "column_sizes": {
          "array": [{ "key": 1, "value": 51}]
        },
        "value_counts": {
          "array": [{"key": 1,"value": 1}]
        },
        "null_value_counts": {
          "array": [{"key": 1,"value": 0}]
        },
        "nan_value_counts": {"array": []},
        "lower_bounds": {
          "array": [{"key": 1,"value": "\u0014\u0000\u0000\u0000"}]
        },
        "upper_bounds": {
          "array": [{"key": 1,"value": "\u0014\u0000\u0000\u0000"}]
        },
        "key_metadata": null,
        "split_offsets": {
          "array": [4]
        }
      }
    }

     

     
    以下是完整的metadata目录下的文件内容,有兴趣的可以再深究
     
    metadata/00001-aabfd9a8-7dcd-4aa0-99aa-f6695f39bf6b.metadata.json
    {
      "format-version" : 1,
      "table-uuid" : "900edf11-3434-408d-a789-a6a5acecdca3",
      "location" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20",
      "last-updated-ms" : 1619022202031,
      "last-column-id" : 1,
      "schema" : {
        "type" : "struct",
        "fields" : [ {
          "id" : 1,
          "name" : "id",
          "required" : false,
          "type" : "int"
        } ]
      },
      "partition-spec" : [ ],
      "default-spec-id" : 0,
      "partition-specs" : [ {
        "spec-id" : 0,
        "fields" : [ ]
      } ],
      "default-sort-order-id" : 0,
      "sort-orders" : [ {
        "order-id" : 0,
        "fields" : [ ]
      } ],
      "properties" : { },
      "current-snapshot-id" : 6190364701448945732,
      "snapshots" : [ {
        "snapshot-id" : 6190364701448945732,
        "timestamp-ms" : 1619022202031,
        "summary" : {
          "operation" : "append",
          "flink.job-id" : "93d92dedbddaf202ac2a2beb9d381084",
          "flink.max-committed-checkpoint-id" : "9223372036854775807",
          "added-data-files" : "1",
          "added-records" : "1",
          "added-files-size" : "387",
          "changed-partition-count" : "1",
          "total-records" : "1",
          "total-data-files" : "1",
          "total-delete-files" : "0",
          "total-position-deletes" : "0",
          "total-equality-deletes" : "0"
        },
        "manifest-list" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro"
      } ],
      "snapshot-log" : [ {
        "timestamp-ms" : 1619022202031,
        "snapshot-id" : 6190364701448945732
      } ],
      "metadata-log" : [ {
        "timestamp-ms" : 1619020518215,
        "metadata-file" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json"
      } ]
    }

    metadata/00002-b5b7725f-7e86-454b-8d16-0e142bc84266.metadata.json

    {
      "format-version" : 1,
      "table-uuid" : "900edf11-3434-408d-a789-a6a5acecdca3",
      "location" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20",
      "last-updated-ms" : 1619023435305,
      "last-column-id" : 1,
      "schema" : {
        "type" : "struct",
        "fields" : [ {
          "id" : 1,
          "name" : "id",
          "required" : false,
          "type" : "int"
        } ]
      },
      "partition-spec" : [ ],
      "default-spec-id" : 0,
      "partition-specs" : [ {
        "spec-id" : 0,
        "fields" : [ ]
      } ],
      "default-sort-order-id" : 0,
      "sort-orders" : [ {
        "order-id" : 0,
        "fields" : [ ]
      } ],
      "properties" : { },
      "current-snapshot-id" : 6460256963744122971,
      "snapshots" : [ {
        "snapshot-id" : 6190364701448945732,
        "timestamp-ms" : 1619022202031,
        "summary" : {
          "operation" : "append",
          "flink.job-id" : "93d92dedbddaf202ac2a2beb9d381084",
          "flink.max-committed-checkpoint-id" : "9223372036854775807",
          "added-data-files" : "1",
          "added-records" : "1",
          "added-files-size" : "387",
          "changed-partition-count" : "1",
          "total-records" : "1",
          "total-data-files" : "1",
          "total-delete-files" : "0",
          "total-position-deletes" : "0",
          "total-equality-deletes" : "0"
        },
        "manifest-list" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro"
      }, {
        "snapshot-id" : 6460256963744122971,
        "parent-snapshot-id" : 6190364701448945732,
        "timestamp-ms" : 1619023435305,
        "summary" : {
          "operation" : "append",
          "flink.job-id" : "3be57424a6547f41f1df350f9667ae65",
          "flink.max-committed-checkpoint-id" : "9223372036854775807",
          "added-data-files" : "1",
          "added-records" : "1",
          "added-files-size" : "387",
          "changed-partition-count" : "1",
          "total-records" : "2",
          "total-data-files" : "2",
          "total-delete-files" : "0",
          "total-position-deletes" : "0",
          "total-equality-deletes" : "0"
        },
        "manifest-list" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/snap-6460256963744122971-1-f787e035-8f7c-43a3-b264-42057bad2710.avro"
      } ],
      "snapshot-log" : [ {
        "timestamp-ms" : 1619022202031,
        "snapshot-id" : 6190364701448945732
      }, {
        "timestamp-ms" : 1619023435305,
        "snapshot-id" : 6460256963744122971
      } ],
      "metadata-log" : [ {
        "timestamp-ms" : 1619020518215,
        "metadata-file" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json"
      }, {
        "timestamp-ms" : 1619022202031,
        "metadata-file" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/00001-aabfd9a8-7dcd-4aa0-99aa-f6695f39bf6b.metadata.json"
      } ]
    }

    metadata/00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json

    {
      "format-version" : 1,
      "table-uuid" : "900edf11-3434-408d-a789-a6a5acecdca3",
      "location" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20",
      "last-updated-ms" : 1619020518215,
      "last-column-id" : 1,
      "schema" : {
        "type" : "struct",
        "schema-id" : 0,
        "fields" : [ {
          "id" : 1,
          "name" : "id",
          "required" : false,
          "type" : "int"
        } ]
      },
      "current-schema-id" : 0,
      "schemas" : [ {
        "type" : "struct",
        "schema-id" : 0,
        "fields" : [ {
          "id" : 1,
          "name" : "id",
          "required" : false,
          "type" : "int"
        } ]
      } ],
      "partition-spec" : [ ],
      "default-spec-id" : 0,
      "partition-specs" : [ {
        "spec-id" : 0,
        "fields" : [ ]
      } ],
      "last-partition-id" : 999,
      "default-sort-order-id" : 0,
      "sort-orders" : [ {
        "order-id" : 0,
        "fields" : [ ]
      } ],
      "properties" : { },
      "current-snapshot-id" : -1,
      "snapshots" : [ ],
      "snapshot-log" : [ ],
      "metadata-log" : [ ]
    }

     metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro

    {
      "status": 1,
      "snapshot_id": {
        "long": 6190364701448945732
      },
      "data_file": {
        "file_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/data/00000-0-9c7ff22e-a767-4b85-91ec-a2771e54c209-00001.parquet",
        "file_format": "PARQUET",
        "partition": {},
        "record_count": 1,
        "file_size_in_bytes": 387,
        "block_size_in_bytes": 67108864,
        "column_sizes": {
          "array": [
            {
              "key": 1,
              "value": 51
            }
          ]
        },
        "value_counts": {
          "array": [
            {
              "key": 1,
              "value": 1
            }
          ]
        },
        "null_value_counts": {
          "array": [
            {
              "key": 1,
              "value": 0
            }
          ]
        },
        "nan_value_counts": {
          "array": []
        },
        "lower_bounds": {
          "array": [
            {
              "key": 1,
              "value": "\n\u0000\u0000\u0000"
            }
          ]
        },
        "upper_bounds": {
          "array": [
            {
              "key": 1,
              "value": "\n\u0000\u0000\u0000"
            }
          ]
        },
        "key_metadata": null,
        "split_offsets": {
          "array": [
            4
          ]
        }
      }
    }

    metadata/f787e035-8f7c-43a3-b264-42057bad2710-m0.avro

    {
      "status": 1,
      "snapshot_id": {
        "long": 6460256963744122971
      },
      "data_file": {
        "file_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/data/00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet",
        "file_format": "PARQUET",
        "partition": {},
        "record_count": 1,
        "file_size_in_bytes": 387,
        "block_size_in_bytes": 67108864,
        "column_sizes": {
          "array": [
            {
              "key": 1,
              "value": 51
            }
          ]
        },
        "value_counts": {
          "array": [
            {
              "key": 1,
              "value": 1
            }
          ]
        },
        "null_value_counts": {
          "array": [
            {
              "key": 1,
              "value": 0
            }
          ]
        },
        "nan_value_counts": {
          "array": []
        },
        "lower_bounds": {
          "array": [
            {
              "key": 1,
              "value": "\u0014\u0000\u0000\u0000"
            }
          ]
        },
        "upper_bounds": {
          "array": [
            {
              "key": 1,
              "value": "\u0014\u0000\u0000\u0000"
            }
          ]
        },
        "key_metadata": null,
        "split_offsets": {
          "array": [
            4
          ]
        }
      }
    }

    metadata/snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro

    {
      "manifest_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro",
      "manifest_length": 5514,
      "partition_spec_id": 0,
      "added_snapshot_id": {
        "long": 6190364701448945732
      },
      "added_data_files_count": {
        "int": 1
      },
      "existing_data_files_count": {
        "int": 0
      },
      "deleted_data_files_count": {
        "int": 0
      },
      "partitions": {
        "array": []
      },
      "added_rows_count": {
        "long": 1
      },
      "existing_rows_count": {
        "long": 0
      },
      "deleted_rows_count": {
        "long": 0
      }
    }

    metadata/snap-6460256963744122971-1-f787e035-8f7c-43a3-b264-42057bad2710.avro

    {
      "manifest_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/f787e035-8f7c-43a3-b264-42057bad2710-m0.avro",
      "manifest_length": 5514,
      "partition_spec_id": 0,
      "added_snapshot_id": {
        "long": 6460256963744122971
      },
      "added_data_files_count": {
        "int": 1
      },
      "existing_data_files_count": {
        "int": 0
      },
      "deleted_data_files_count": {
        "int": 0
      },
      "partitions": {
        "array": []
      },
      "added_rows_count": {
        "long": 1
      },
      "existing_rows_count": {
        "long": 0
      },
      "deleted_rows_count": {
        "long": 0
      }
    }
    {
      "manifest_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro",
      "manifest_length": 5514,
      "partition_spec_id": 0,
      "added_snapshot_id": {
        "long": 6190364701448945732
      },
      "added_data_files_count": {
        "int": 1
      },
      "existing_data_files_count": {
        "int": 0
      },
      "deleted_data_files_count": {
        "int": 0
      },
      "partitions": {
        "array": []
      },
      "added_rows_count": {
        "long": 1
      },
      "existing_rows_count": {
        "long": 0
      },
      "deleted_rows_count": {
        "long": 0
      }
    }
  • 相关阅读:
    k8s-学习笔记12-权限体系
    Linux上磁盘热插拔
    delphi hashmap
    my gcc project
    gcc dll 导出问题 GTK+Glade3 Gtk-WARNING **: Could not find signal handler 问题最终解析
    c/c++字符串定义及使用的对比
    gcc printf()打印char* str
    gcc选项-g与-rdynamic的异同
    GCC编译,库的编译使用及Makefile
    gcc test
  • 原文地址:https://www.cnblogs.com/codetouse/p/14783454.html
Copyright © 2011-2022 走看看