From 57ed66646d385de5c7dd4bd1d8a3031571ad6299 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Fri, 3 May 2024 20:21:15 -0700 Subject: [PATCH] Convert statistics based on column type --- src/metadata.js | 57 +++++++++++++++---- src/types.d.ts | 6 +- test/files/Int_Map.metadata.json | 8 +-- .../concatenated_gzip_members.metadata.json | 4 +- test/files/datapage_v2.snappy.metadata.json | 16 +++--- test/files/nonnullable.impala.metadata.json | 32 +++++------ test/files/rowgroups.metadata.json | 16 +++--- 7 files changed, 88 insertions(+), 51 deletions(-) diff --git a/src/metadata.js b/src/metadata.js index ccc6271..2999d84 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -115,9 +115,11 @@ export function parquetMetadata(arrayBuffer) { field_id: field.field_9, logical_type: logicalType(field.field_10), })) + // @ts-expect-error get types by column index + const columnTypes = schema.map(e => e.type).filter(e => e) const num_rows = metadata.field_3 const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({ - columns: rowGroup.field_1.map((/** @type {any} */ column) => ({ + columns: rowGroup.field_1.map((/** @type {any} */ column, /** @type {number} */ columnIndex) => ({ file_path: decode(column.field_1), file_offset: column.field_2, meta_data: column.field_3 && { @@ -132,16 +134,7 @@ export function parquetMetadata(arrayBuffer) { data_page_offset: column.field_3.field_9, index_page_offset: column.field_3.field_10, dictionary_page_offset: column.field_3.field_11, - statistics: column.field_3.field_12 && { - max: decode(column.field_3.field_12.field_1), - min: decode(column.field_3.field_12.field_2), - null_count: column.field_3.field_12.field_3, - distinct_count: column.field_3.field_12.field_4, - max_value: decode(column.field_3.field_12.field_5), - min_value: decode(column.field_3.field_12.field_6), - is_max_value_exact: column.field_3.field_12.field_7, - is_min_value_exact: column.field_3.field_12.field_8, - }, + statistics: columnStats(column.field_3.field_12, columnTypes[columnIndex]), encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({ page_type: encodingStat.field_1, encoding: Encoding[encodingStat.field_2], @@ -228,3 +221,45 @@ function logicalType(logicalType) { return logicalType } } + +/** + * Convert column statistics based on column type. + * + * @param {any} stats + * @param {import("./types.d.ts").ParquetType} type + * @returns {import("./types.d.ts").Statistics} + */ +function columnStats(stats, type) { + function convert(/** @type {Uint8Array} */ value) { + if (value === undefined) return value + if (type === 'BOOLEAN') return value[0] === 1 + if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value) + if (type === 'INT32') { + const view = new DataView(value.buffer, value.byteOffset, value.byteLength) + return view.getInt32(0, true) + } + if (type === 'INT64') { + const view = new DataView(value.buffer, value.byteOffset, value.byteLength) + return view.getBigInt64(0, true) + } + if (type === 'FLOAT') { + const view = new DataView(value.buffer, value.byteOffset, value.byteLength) + return view.getFloat32(0, true) + } + if (type === 'DOUBLE') { + const view = new DataView(value.buffer, value.byteOffset, value.byteLength) + return view.getFloat64(0, true) + } + return value + } + return stats && { + max: convert(stats.field_1), + min: convert(stats.field_2), + null_count: stats.field_3, + distinct_count: stats.field_4, + max_value: convert(stats.field_5), + min_value: convert(stats.field_6), + is_max_value_exact: stats.field_7, + is_min_value_exact: stats.field_8, + } +} diff --git a/src/types.d.ts b/src/types.d.ts index d770bce..79b456d 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -192,9 +192,11 @@ interface KeyValue { value?: string } +type MinMaxType = bigint | boolean | number | string + export interface Statistics { - max?: string - min?: string + max?: MinMaxType + min?: MinMaxType null_count?: bigint distinct_count?: bigint max_value?: string diff --git a/test/files/Int_Map.metadata.json b/test/files/Int_Map.metadata.json index dc6a681..7bcbaa6 100644 --- a/test/files/Int_Map.metadata.json +++ b/test/files/Int_Map.metadata.json @@ -34,10 +34,10 @@ "num_values": 10, "path_in_schema": ["int_map", "key_value", "value"], "statistics": { - "max": "d\u0000\u0000\u0000", - "min": "\u0001\u0000\u0000\u0000", - "max_value": "d\u0000\u0000\u0000", - "min_value": "\u0001\u0000\u0000\u0000" + "max": 100, + "min": 1, + "max_value": 100, + "min_value": 1 }, "total_compressed_size": 60, "total_uncompressed_size": 59, diff --git a/test/files/concatenated_gzip_members.metadata.json b/test/files/concatenated_gzip_members.metadata.json index 7405607..c078a58 100644 --- a/test/files/concatenated_gzip_members.metadata.json +++ b/test/files/concatenated_gzip_members.metadata.json @@ -19,8 +19,8 @@ "long_col" ], "statistics": { - "max_value": "\u0001\u0002\u0000\u0000\u0000\u0000\u0000\u0000", - "min_value": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000" + "max_value": 513, + "min_value": 1 }, "total_compressed_size": 1467, "total_uncompressed_size": 4155, diff --git a/test/files/datapage_v2.snappy.metadata.json b/test/files/datapage_v2.snappy.metadata.json index a5b583f..0f7bb2c 100644 --- a/test/files/datapage_v2.snappy.metadata.json +++ b/test/files/datapage_v2.snappy.metadata.json @@ -39,8 +39,8 @@ "num_values": 5, "path_in_schema": ["b"], "statistics": { - "max": "\u0005\u0000\u0000\u0000", - "min": "\u0001\u0000\u0000\u0000", + "max": 5, + "min": 1, "null_count": 0 }, "total_compressed_size": 49, @@ -57,8 +57,8 @@ "num_values": 5, "path_in_schema": ["c"], "statistics": { - "max": "\u0000\u0000\u0000\u0000\u0000\u0000\u0014@", - "min": "\u0000\u0000\u0000\u0000\u0000\u0000\u0000@", + "max": 5, + "min": 2, "null_count": 0 }, "total_compressed_size": 88, @@ -75,8 +75,8 @@ "num_values": 5, "path_in_schema": ["d"], "statistics": { - "max": "\u0001", - "min": "\u0000", + "max": true, + "min": false, "null_count": 0 }, "total_compressed_size": 39, @@ -97,8 +97,8 @@ "element" ], "statistics": { - "max": "\u0003\u0000\u0000\u0000", - "min": "\u0001\u0000\u0000\u0000", + "max": 3, + "min": 1, "null_count": 2 }, "total_compressed_size": 78, diff --git a/test/files/nonnullable.impala.metadata.json b/test/files/nonnullable.impala.metadata.json index 05e1517..b39443c 100644 --- a/test/files/nonnullable.impala.metadata.json +++ b/test/files/nonnullable.impala.metadata.json @@ -21,8 +21,8 @@ "num_values": 1, "path_in_schema": [ "ID" ], "statistics": { - "max": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000", - "min": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000", + "max": 8, + "min": 8, "null_count": 0 }, "total_compressed_size": 49, @@ -39,8 +39,8 @@ "num_values": 1, "path_in_schema": [ "Int_Array", "list", "element" ], "statistics": { - "max": "����", - "min": "����", + "max": -1, + "min": -1, "null_count": 0 }, "total_compressed_size": 49, @@ -63,8 +63,8 @@ "element" ], "statistics": { - "max": "����", - "min": "����", + "max": -1, + "min": -2, "null_count": 1 }, "total_compressed_size": 55, @@ -99,8 +99,8 @@ "num_values": 1, "path_in_schema": [ "Int_Map", "map", "value" ], "statistics": { - "max": "����", - "min": "����", + "max": -1, + "min": -1, "null_count": 0 }, "total_compressed_size": 49, @@ -147,8 +147,8 @@ "value" ], "statistics": { - "max": "\u0001\u0000\u0000\u0000", - "min": "\u0001\u0000\u0000\u0000", + "max": 1, + "min": 1, "null_count": 3 }, "total_compressed_size": 51, @@ -165,8 +165,8 @@ "num_values": 1, "path_in_schema": [ "nested_Struct", "a" ], "statistics": { - "max": "����", - "min": "����", + "max": -1, + "min": -1, "null_count": 0 }, "total_compressed_size": 37, @@ -183,8 +183,8 @@ "num_values": 1, "path_in_schema": [ "nested_Struct", "B", "list", "element" ], "statistics": { - "max": "����", - "min": "����", + "max": -1, + "min": -1, "null_count": 0 }, "total_compressed_size": 49, @@ -210,8 +210,8 @@ "e" ], "statistics": { - "max": "����", - "min": "����", + "max": -1, + "min": -1, "null_count": 0 }, "total_compressed_size": 51, diff --git a/test/files/rowgroups.metadata.json b/test/files/rowgroups.metadata.json index 58516b2..b9e6c78 100644 --- a/test/files/rowgroups.metadata.json +++ b/test/files/rowgroups.metadata.json @@ -32,10 +32,10 @@ "num_values": 10, "path_in_schema": ["numbers"], "statistics": { - "max": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000", - "min": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000", - "max_value": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000", - "min_value": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000", + "max": 10, + "min": 1, + "max_value": 10, + "min_value": 1, "null_count": 0 }, "total_compressed_size": 146, @@ -66,10 +66,10 @@ "num_values": 5, "path_in_schema": ["numbers"], "statistics": { - "max": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000", - "min": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000", - "max_value": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000", - "min_value": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000", + "max": 15, + "min": 11, + "max_value": 15, + "min_value": 11, "null_count": 0 }, "total_compressed_size": 120,