mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-09 20:46:37 +00:00
Convert statistics based on column type
This commit is contained in:
parent
3d5d423694
commit
57ed66646d
@ -115,9 +115,11 @@ export function parquetMetadata(arrayBuffer) {
|
||||
field_id: field.field_9,
|
||||
logical_type: logicalType(field.field_10),
|
||||
}))
|
||||
// @ts-expect-error get types by column index
|
||||
const columnTypes = schema.map(e => e.type).filter(e => e)
|
||||
const num_rows = metadata.field_3
|
||||
const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({
|
||||
columns: rowGroup.field_1.map((/** @type {any} */ column) => ({
|
||||
columns: rowGroup.field_1.map((/** @type {any} */ column, /** @type {number} */ columnIndex) => ({
|
||||
file_path: decode(column.field_1),
|
||||
file_offset: column.field_2,
|
||||
meta_data: column.field_3 && {
|
||||
@ -132,16 +134,7 @@ export function parquetMetadata(arrayBuffer) {
|
||||
data_page_offset: column.field_3.field_9,
|
||||
index_page_offset: column.field_3.field_10,
|
||||
dictionary_page_offset: column.field_3.field_11,
|
||||
statistics: column.field_3.field_12 && {
|
||||
max: decode(column.field_3.field_12.field_1),
|
||||
min: decode(column.field_3.field_12.field_2),
|
||||
null_count: column.field_3.field_12.field_3,
|
||||
distinct_count: column.field_3.field_12.field_4,
|
||||
max_value: decode(column.field_3.field_12.field_5),
|
||||
min_value: decode(column.field_3.field_12.field_6),
|
||||
is_max_value_exact: column.field_3.field_12.field_7,
|
||||
is_min_value_exact: column.field_3.field_12.field_8,
|
||||
},
|
||||
statistics: columnStats(column.field_3.field_12, columnTypes[columnIndex]),
|
||||
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({
|
||||
page_type: encodingStat.field_1,
|
||||
encoding: Encoding[encodingStat.field_2],
|
||||
@ -228,3 +221,45 @@ function logicalType(logicalType) {
|
||||
return logicalType
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Convert column statistics based on column type.
 *
 * The raw min/max statistics arrive as byte arrays; this decodes them into
 * JavaScript values according to the column's physical type:
 *  - BOOLEAN: true when the first byte equals 1
 *  - BYTE_ARRAY: decoded as text with TextDecoder (utf-8 by default)
 *  - INT32 / FLOAT: little-endian read of the first 4 bytes
 *  - INT64 / DOUBLE: little-endian read of the first 8 bytes (INT64 as bigint)
 *  - any other type: the raw bytes are returned unchanged
 *
 * A numeric statistic that is shorter than the width of its type (for example
 * an empty buffer) is returned as raw bytes instead of throwing a RangeError,
 * so a single malformed statistic cannot abort metadata parsing.
 *
 * @param {any} stats raw statistics struct (field_1 .. field_8), may be undefined
 * @param {import("./types.d.ts").ParquetType} type physical type of the column
 * @returns {import("./types.d.ts").Statistics} converted statistics, or `stats`
 *   itself when it is falsy
 */
function columnStats(stats, type) {
  /**
   * Decode a single min/max statistic according to `type`.
   *
   * @param {Uint8Array | undefined} value raw statistic bytes
   * @returns {any} decoded value, or the input unchanged when it cannot be decoded
   */
  function convert(value) {
    // Missing statistics stay missing.
    if (value === undefined || value === null) return value
    if (type === 'BOOLEAN') return value[0] === 1
    if (type === 'BYTE_ARRAY') return new TextDecoder().decode(value)

    const { byteLength } = value
    // Respect byteOffset: the statistic may be a view into a larger buffer.
    const view = () => new DataView(value.buffer, value.byteOffset, byteLength)

    // Only read when enough bytes are present; DataView getters throw otherwise.
    if (type === 'INT32' && byteLength >= 4) return view().getInt32(0, true)
    if (type === 'INT64' && byteLength >= 8) return view().getBigInt64(0, true)
    if (type === 'FLOAT' && byteLength >= 4) return view().getFloat32(0, true)
    if (type === 'DOUBLE' && byteLength >= 8) return view().getFloat64(0, true)

    // Unknown type or truncated numeric value: hand back the raw bytes.
    return value
  }

  return stats && {
    max: convert(stats.field_1),
    min: convert(stats.field_2),
    null_count: stats.field_3,
    distinct_count: stats.field_4,
    max_value: convert(stats.field_5),
    min_value: convert(stats.field_6),
    is_max_value_exact: stats.field_7,
    is_min_value_exact: stats.field_8,
  }
}
|
||||
|
||||
6
src/types.d.ts
vendored
6
src/types.d.ts
vendored
@ -192,9 +192,11 @@ interface KeyValue {
|
||||
value?: string
|
||||
}
|
||||
|
||||
type MinMaxType = bigint | boolean | number | string
|
||||
|
||||
export interface Statistics {
|
||||
max?: string
|
||||
min?: string
|
||||
max?: MinMaxType
|
||||
min?: MinMaxType
|
||||
null_count?: bigint
|
||||
distinct_count?: bigint
|
||||
max_value?: string
|
||||
|
||||
@ -34,10 +34,10 @@
|
||||
"num_values": 10,
|
||||
"path_in_schema": ["int_map", "key_value", "value"],
|
||||
"statistics": {
|
||||
"max": "d\u0000\u0000\u0000",
|
||||
"min": "\u0001\u0000\u0000\u0000",
|
||||
"max_value": "d\u0000\u0000\u0000",
|
||||
"min_value": "\u0001\u0000\u0000\u0000"
|
||||
"max": 100,
|
||||
"min": 1,
|
||||
"max_value": 100,
|
||||
"min_value": 1
|
||||
},
|
||||
"total_compressed_size": 60,
|
||||
"total_uncompressed_size": 59,
|
||||
|
||||
@ -19,8 +19,8 @@
|
||||
"long_col"
|
||||
],
|
||||
"statistics": {
|
||||
"max_value": "\u0001\u0002\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"min_value": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000"
|
||||
"max_value": 513,
|
||||
"min_value": 1
|
||||
},
|
||||
"total_compressed_size": 1467,
|
||||
"total_uncompressed_size": 4155,
|
||||
|
||||
@ -39,8 +39,8 @@
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["b"],
|
||||
"statistics": {
|
||||
"max": "\u0005\u0000\u0000\u0000",
|
||||
"min": "\u0001\u0000\u0000\u0000",
|
||||
"max": 5,
|
||||
"min": 1,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
@ -57,8 +57,8 @@
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["c"],
|
||||
"statistics": {
|
||||
"max": "\u0000\u0000\u0000\u0000\u0000\u0000\u0014@",
|
||||
"min": "\u0000\u0000\u0000\u0000\u0000\u0000\u0000@",
|
||||
"max": 5,
|
||||
"min": 2,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 88,
|
||||
@ -75,8 +75,8 @@
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["d"],
|
||||
"statistics": {
|
||||
"max": "\u0001",
|
||||
"min": "\u0000",
|
||||
"max": true,
|
||||
"min": false,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 39,
|
||||
@ -97,8 +97,8 @@
|
||||
"element"
|
||||
],
|
||||
"statistics": {
|
||||
"max": "\u0003\u0000\u0000\u0000",
|
||||
"min": "\u0001\u0000\u0000\u0000",
|
||||
"max": 3,
|
||||
"min": 1,
|
||||
"null_count": 2
|
||||
},
|
||||
"total_compressed_size": 78,
|
||||
|
||||
@ -21,8 +21,8 @@
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "ID" ],
|
||||
"statistics": {
|
||||
"max": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"min": "\b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"max": 8,
|
||||
"min": 8,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
@ -39,8 +39,8 @@
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "Int_Array", "list", "element" ],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"max": -1,
|
||||
"min": -1,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
@ -63,8 +63,8 @@
|
||||
"element"
|
||||
],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"max": -1,
|
||||
"min": -2,
|
||||
"null_count": 1
|
||||
},
|
||||
"total_compressed_size": 55,
|
||||
@ -99,8 +99,8 @@
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "Int_Map", "map", "value" ],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"max": -1,
|
||||
"min": -1,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
@ -147,8 +147,8 @@
|
||||
"value"
|
||||
],
|
||||
"statistics": {
|
||||
"max": "\u0001\u0000\u0000\u0000",
|
||||
"min": "\u0001\u0000\u0000\u0000",
|
||||
"max": 1,
|
||||
"min": 1,
|
||||
"null_count": 3
|
||||
},
|
||||
"total_compressed_size": 51,
|
||||
@ -165,8 +165,8 @@
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "nested_Struct", "a" ],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"max": -1,
|
||||
"min": -1,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 37,
|
||||
@ -183,8 +183,8 @@
|
||||
"num_values": 1,
|
||||
"path_in_schema": [ "nested_Struct", "B", "list", "element" ],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"max": -1,
|
||||
"min": -1,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 49,
|
||||
@ -210,8 +210,8 @@
|
||||
"e"
|
||||
],
|
||||
"statistics": {
|
||||
"max": "<22><><EFBFBD><EFBFBD>",
|
||||
"min": "<22><><EFBFBD><EFBFBD>",
|
||||
"max": -1,
|
||||
"min": -1,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 51,
|
||||
|
||||
@ -32,10 +32,10 @@
|
||||
"num_values": 10,
|
||||
"path_in_schema": ["numbers"],
|
||||
"statistics": {
|
||||
"max": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"min": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"max_value": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"min_value": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"max": 10,
|
||||
"min": 1,
|
||||
"max_value": 10,
|
||||
"min_value": 1,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 146,
|
||||
@ -66,10 +66,10 @@
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["numbers"],
|
||||
"statistics": {
|
||||
"max": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"min": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"max_value": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"min_value": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"max": 15,
|
||||
"min": 11,
|
||||
"max_value": 15,
|
||||
"min_value": 11,
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 120,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user