// hyparquet/src/metadata.js
import { deserializeTCompactProtocol } from './thrift.js'
/**
 * Read parquet header, metadata, and schema information from a file
 *
 * @typedef {import("./types.js").FileMetaData} FileMetaData
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @returns {FileMetaData} metadata object
 */
export function parquetMetadata(arrayBuffer) {
  // DataView for easier manipulation of the buffer
  const view = new DataView(arrayBuffer)

  // A valid footer is at least 8 bytes: 4-byte metadata length + "PAR1" magic
  if (view.byteLength < 8) {
    throw new Error('parquet file is too short')
  }
  // Validate the trailing magic bytes "PAR1" (0x31524150 read little-endian)
  if (view.getUint32(view.byteLength - 4, true) !== 0x31524150) {
    throw new Error('parquet file invalid magic number')
  }

  // Parquet stores metadata at the end of the file;
  // the 4 bytes before the final PAR1 hold the metadata block length
  const metadataLengthOffset = view.byteLength - 8
  const metadataLength = view.getUint32(metadataLengthOffset, true)
  if (metadataLength <= 0 || metadataLength > metadataLengthOffset) {
    throw new Error('parquet file invalid metadata length')
  }

  // Decode the thrift compact-protocol FileMetaData struct
  const metadataBuffer = view.buffer.slice(metadataLengthOffset - metadataLength, metadataLengthOffset)
  const { value: metadata } = deserializeTCompactProtocol(metadataBuffer)

  // Map generic thrift field numbers onto named Statistics properties
  const decodeStatistics = (/** @type {any} */ stats) => stats && {
    max: stats.field_1,
    min: stats.field_2,
    null_count: stats.field_3,
    distinct_count: stats.field_4,
  }

  // Map generic thrift field numbers onto a named ColumnChunk object
  const decodeColumn = (/** @type {any} */ column) => ({
    file_path: column.field_1,
    file_offset: column.field_2,
    meta_data: column.field_3 && {
      type: column.field_3.field_1,
      encodings: column.field_3.field_2,
      path_in_schema: column.field_3.field_3,
      codec: column.field_3.field_4,
      num_values: column.field_3.field_5,
      total_uncompressed_size: column.field_3.field_6,
      total_compressed_size: column.field_3.field_7,
      key_value_metadata: column.field_3.field_8,
      data_page_offset: column.field_3.field_9,
      index_page_offset: column.field_3.field_10,
      dictionary_page_offset: column.field_3.field_11,
      statistics: decodeStatistics(column.field_3.field_12),
      encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({
        page_type: encodingStat.field_1,
        encoding: encodingStat.field_2,
        count: encodingStat.field_3,
      })),
    },
  })

  // Schema elements, one per node of the flattened schema tree
  const schema = metadata.field_2.map((/** @type {any} */ element) => ({
    type: element.field_1,
    type_length: element.field_2,
    repetition_type: element.field_3,
    name: element.field_4,
    num_children: element.field_5,
    converted_type: element.field_6,
    scale: element.field_7,
    precision: element.field_8,
    field_id: element.field_9,
  }))

  // Row groups with their column chunks and optional sort info
  const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({
    columns: rowGroup.field_1.map(decodeColumn),
    total_byte_size: rowGroup.field_2,
    num_rows: rowGroup.field_3,
    sorting_columns: rowGroup.field_4?.map((/** @type {any} */ sortingColumn) => ({
      column_idx: sortingColumn.field_1,
      descending: sortingColumn.field_2,
      nulls_first: sortingColumn.field_3,
    })),
  }))

  return {
    version: metadata.field_1,
    schema,
    num_rows: metadata.field_3,
    row_groups,
    key_value_metadata: metadata.field_5?.map((/** @type {any} */ keyValue) => ({
      key: keyValue.field_1,
      value: keyValue.field_2,
    })),
    created_by: metadata.field_6,
  }
}
/**
 * Replace bigints with numbers, recursively.
 * When parsing parquet files, bigints are used to represent 64-bit integers.
 * However, JSON does not support bigints, so it's helpful to convert to numbers.
 *
 * @param {any} obj object to convert
 * @returns {unknown} converted object
 */
export function toJson(obj) {
  // bigint leaf: coerce to a plain number
  if (typeof obj === 'bigint') return Number(obj)
  // array: convert each element
  if (Array.isArray(obj)) return obj.map(toJson)
  // plain object (null is excluded by instanceof): convert each own key
  if (obj instanceof Object) {
    /** @type {Record<string, unknown>} */
    const converted = {}
    Object.keys(obj).forEach(key => {
      converted[key] = toJson(obj[key])
    })
    return converted
  }
  // primitives (string, number, boolean, null, undefined) pass through
  return obj
}