Convert statistics based on column type

This commit is contained in:
Kenny Daniel 2024-05-03 20:21:15 -07:00
parent 3d5d423694
commit 57ed66646d
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
7 changed files with 88 additions and 51 deletions

@ -115,9 +115,11 @@ export function parquetMetadata(arrayBuffer) {
field_id: field.field_9,
logical_type: logicalType(field.field_10),
}))
// @ts-expect-error get types by column index
const columnTypes = schema.map(e => e.type).filter(e => e)
const num_rows = metadata.field_3
const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({
columns: rowGroup.field_1.map((/** @type {any} */ column) => ({
columns: rowGroup.field_1.map((/** @type {any} */ column, /** @type {number} */ columnIndex) => ({
file_path: decode(column.field_1),
file_offset: column.field_2,
meta_data: column.field_3 && {
@ -132,16 +134,7 @@ export function parquetMetadata(arrayBuffer) {
data_page_offset: column.field_3.field_9,
index_page_offset: column.field_3.field_10,
dictionary_page_offset: column.field_3.field_11,
statistics: column.field_3.field_12 && {
max: decode(column.field_3.field_12.field_1),
min: decode(column.field_3.field_12.field_2),
null_count: column.field_3.field_12.field_3,
distinct_count: column.field_3.field_12.field_4,
max_value: decode(column.field_3.field_12.field_5),
min_value: decode(column.field_3.field_12.field_6),
is_max_value_exact: column.field_3.field_12.field_7,
is_min_value_exact: column.field_3.field_12.field_8,
},
statistics: columnStats(column.field_3.field_12, columnTypes[columnIndex]),
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({
page_type: encodingStat.field_1,
encoding: Encoding[encodingStat.field_2],
@ -228,3 +221,45 @@ function logicalType(logicalType) {
return logicalType
}
}
/**
* Convert column statistics based on column type.
*
* @param {any} stats
* @param {import("./types.d.ts").ParquetType} type
* @returns {import("./types.d.ts").Statistics}
*/
function columnStats(stats, type) {
function convert(/** @type {Uint8Array} */ value) {
if (value === undefined) return value
if (type === 'BOOLEAN') return value[0] === 1
if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value)
if (type === 'INT32') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getInt32(0, true)
}
if (type === 'INT64') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getBigInt64(0, true)
}
if (type === 'FLOAT') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getFloat32(0, true)
}
if (type === 'DOUBLE') {
const view = new DataView(value.buffer, value.byteOffset, value.byteLength)
return view.getFloat64(0, true)
}
return value
}
return stats && {
max: convert(stats.field_1),
min: convert(stats.field_2),
null_count: stats.field_3,
distinct_count: stats.field_4,
max_value: convert(stats.field_5),
min_value: convert(stats.field_6),
is_max_value_exact: stats.field_7,
is_min_value_exact: stats.field_8,
}
}

6
src/types.d.ts vendored

@ -192,9 +192,11 @@ interface KeyValue {
value?: string
}
/** Value types used for the `max` and `min` fields of Statistics. */
type MinMaxType = bigint | boolean | number | string
export interface Statistics {
max?: string
min?: string
max?: MinMaxType
min?: MinMaxType
null_count?: bigint
distinct_count?: bigint
max_value?: string

@ -34,10 +34,10 @@
"num_values": 10,
"path_in_schema": ["int_map", "key_value", "value"],
"statistics": {
"max": "d\u0000\u0000\u0000",
"min": "\u0001\u0000\u0000\u0000",
"max_value": "d\u0000\u0000\u0000",
"min_value": "\u0001\u0000\u0000\u0000"
"max": 100,
"min": 1,
"max_value": 100,
"min_value": 1
},
"total_compressed_size": 60,
"total_uncompressed_size": 59,

@ -19,8 +19,8 @@
"long_col"
],
"statistics": {
"max_value": "\u0001\u0002\u0000\u0000\u0000\u0000\u0000\u0000",
"min_value": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
"max_value": 513,
"min_value": 1
},
"total_compressed_size": 1467,
"total_uncompressed_size": 4155,

@ -39,8 +39,8 @@
"num_values": 5,
"path_in_schema": ["b"],
"statistics": {
"max": "\u0005\u0000\u0000\u0000",
"min": "\u0001\u0000\u0000\u0000",
"max": 5,
"min": 1,
"null_count": 0
},
"total_compressed_size": 49,
@ -57,8 +57,8 @@
"num_values": 5,
"path_in_schema": ["c"],
"statistics": {
"max": "\u0000\u0000\u0000\u0000\u0000\u0000\u0014@",
"min": "\u0000\u0000\u0000\u0000\u0000\u0000\u0000@",
"max": 5,
"min": 2,
"null_count": 0
},
"total_compressed_size": 88,
@ -75,8 +75,8 @@
"num_values": 5,
"path_in_schema": ["d"],
"statistics": {
"max": "\u0001",
"min": "\u0000",
"max": true,
"min": false,
"null_count": 0
},
"total_compressed_size": 39,
@ -97,8 +97,8 @@
"element"
],
"statistics": {
"max": "\u0003\u0000\u0000\u0000",
"min": "\u0001\u0000\u0000\u0000",
"max": 3,
"min": 1,
"null_count": 2
},
"total_compressed_size": 78,

@ -21,8 +21,8 @@
"num_values": 1,
"path_in_schema": [ "ID" ],
"statistics": {
"max": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"min": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"max": 8,
"min": 8,
"null_count": 0
},
"total_compressed_size": 49,
@ -39,8 +39,8 @@
"num_values": 1,
"path_in_schema": [ "Int_Array", "list", "element" ],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"max": -1,
"min": -1,
"null_count": 0
},
"total_compressed_size": 49,
@ -63,8 +63,8 @@
"element"
],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"max": -1,
"min": -2,
"null_count": 1
},
"total_compressed_size": 55,
@ -99,8 +99,8 @@
"num_values": 1,
"path_in_schema": [ "Int_Map", "map", "value" ],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"max": -1,
"min": -1,
"null_count": 0
},
"total_compressed_size": 49,
@ -147,8 +147,8 @@
"value"
],
"statistics": {
"max": "\u0001\u0000\u0000\u0000",
"min": "\u0001\u0000\u0000\u0000",
"max": 1,
"min": 1,
"null_count": 3
},
"total_compressed_size": 51,
@ -165,8 +165,8 @@
"num_values": 1,
"path_in_schema": [ "nested_Struct", "a" ],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"max": -1,
"min": -1,
"null_count": 0
},
"total_compressed_size": 37,
@ -183,8 +183,8 @@
"num_values": 1,
"path_in_schema": [ "nested_Struct", "B", "list", "element" ],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"max": -1,
"min": -1,
"null_count": 0
},
"total_compressed_size": 49,
@ -210,8 +210,8 @@
"e"
],
"statistics": {
"max": "<22><><EFBFBD><EFBFBD>",
"min": "<22><><EFBFBD><EFBFBD>",
"max": -1,
"min": -1,
"null_count": 0
},
"total_compressed_size": 51,

@ -32,10 +32,10 @@
"num_values": 10,
"path_in_schema": ["numbers"],
"statistics": {
"max": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"min": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"max_value": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"min_value": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"max": 10,
"min": 1,
"max_value": 10,
"min_value": 1,
"null_count": 0
},
"total_compressed_size": 146,
@ -66,10 +66,10 @@
"num_values": 5,
"path_in_schema": ["numbers"],
"statistics": {
"max": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"min": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"max_value": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"min_value": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"max": 15,
"min": 11,
"max_value": 15,
"min_value": 11,
"null_count": 0
},
"total_compressed_size": 120,