2024-01-03 18:33:37 +00:00
|
|
|
import { deserializeTCompactProtocol } from './thrift.js'
|
|
|
|
|
import type { FileMetaData, SchemaElement } from './types.ts'
|
2024-01-03 17:56:17 +00:00
|
|
|
|
|
|
|
|
/**
 * Read parquet metadata and schema information from the footer of a file.
 *
 * The end of the buffer is expected to look like:
 *   [thrift-encoded metadata][4-byte little-endian metadata length]["PAR1"]
 * Only this trailing region is inspected; nothing before the metadata is read.
 *
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @returns {FileMetaData} metadata object
 * @throws {Error} 'parquet file is too short' if the buffer is under 8 bytes
 * @throws {Error} 'parquet file invalid magic number' if it does not end in "PAR1"
 * @throws {Error} 'parquet file invalid metadata length' if the stored length
 *   is zero or larger than the bytes available before the length field
 */
export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData {
  // DataView for easier manipulation of the buffer
  const view = new DataView(arrayBuffer)

  // The footer needs at least the 4-byte length plus the 4-byte magic number
  if (view.byteLength < 8) {
    throw new Error('parquet file is too short')
  }
  // 0x31524150 is the ASCII bytes "PAR1" read as a little-endian uint32
  if (view.getUint32(view.byteLength - 4, true) !== 0x31524150) {
    throw new Error('parquet file invalid magic number')
  }

  // Parquet files store metadata at the end of the file.
  // Metadata length is the 4 bytes before the trailing PAR1.
  const metadataLengthOffset = view.byteLength - 8
  const metadataLength = view.getUint32(metadataLengthOffset, true)
  // getUint32 never returns a negative value, so zero is the only low-side failure
  if (metadataLength === 0 || metadataLength > metadataLengthOffset) {
    throw new Error('parquet file invalid metadata length')
  }

  // The metadata occupies the bytes immediately before the length field
  const metadataOffset = metadataLengthOffset - metadataLength
  const metadataBuffer = view.buffer.slice(metadataOffset, metadataLengthOffset)
  const { value: metadata } = deserializeTCompactProtocol(metadataBuffer)

  // Translate numbered thrift fields (field_N) into named properties
  const version = metadata.field_1
  const schema = metadata.field_2.map((field: any) => ({
    type: field.field_1,
    type_length: field.field_2,
    repetition_type: field.field_3,
    name: field.field_4,
    num_children: field.field_5,
    converted_type: field.field_6,
    scale: field.field_7,
    precision: field.field_8,
    field_id: field.field_9,
  }))
  const num_rows = metadata.field_3
  const row_groups = metadata.field_4.map((rowGroup: any) => ({
    columns: rowGroup.field_1.map((column: any) => {
      // Both nested structs may be absent; `&&` keeps the absent value as-is
      const columnMeta = column.field_3
      const stats = columnMeta && columnMeta.field_12
      return {
        file_path: column.field_1,
        file_offset: column.field_2,
        meta_data: columnMeta && {
          type: columnMeta.field_1,
          encodings: columnMeta.field_2,
          path_in_schema: columnMeta.field_3,
          codec: columnMeta.field_4,
          num_values: columnMeta.field_5,
          total_uncompressed_size: columnMeta.field_6,
          total_compressed_size: columnMeta.field_7,
          key_value_metadata: columnMeta.field_8,
          data_page_offset: columnMeta.field_9,
          index_page_offset: columnMeta.field_10,
          dictionary_page_offset: columnMeta.field_11,
          statistics: stats && {
            max: stats.field_1,
            min: stats.field_2,
            null_count: stats.field_3,
            distinct_count: stats.field_4,
          },
          encoding_stats: columnMeta.field_13?.map((encodingStat: any) => ({
            page_type: encodingStat.field_1,
            encoding: encodingStat.field_2,
            count: encodingStat.field_3,
          })),
        },
      }
    }),
    total_byte_size: rowGroup.field_2,
    num_rows: rowGroup.field_3,
    sorting_columns: rowGroup.field_4?.map((sortingColumn: any) => ({
      column_idx: sortingColumn.field_1,
      descending: sortingColumn.field_2,
      nulls_first: sortingColumn.field_3,
    })),
  }))
  const key_value_metadata = metadata.field_5?.map((keyValue: any) => ({
    key: keyValue.field_1,
    value: keyValue.field_2,
  }))
  const created_by = metadata.field_6

  return {
    version,
    schema,
    num_rows,
    row_groups,
    key_value_metadata,
    created_by,
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Get the schema element with the given name.
 *
 * The path segments are joined with '.' and compared against each element's
 * `name`. If several elements share that name, the last one in `schema` is
 * returned.
 *
 * @param {SchemaElement[]} schema parquet schema
 * @param {string[]} name path to the element
 * @returns {SchemaElement} schema element
 * @throws {Error} 'schema element not found: <dotted name>' if nothing matches
 */
export function schemaElement(schema: SchemaElement[], name: string[]): SchemaElement {
  const key = name.join('.')

  // Scan the whole list without early exit so the last duplicate wins
  let element: SchemaElement | undefined
  for (const candidate of schema) {
    if (candidate.name === key) {
      element = candidate
    }
  }

  if (!element) {
    // Report the dotted key that was actually searched for
    throw new Error(`schema element not found: ${key}`)
  }
  return element
}
|
2024-01-03 18:33:37 +00:00
|
|
|
|
|
|
|
|
/**
 * Replace bigints with numbers.
 * When parsing parquet files, bigints are used to represent 64-bit integers.
 * However, JSON does not support bigints, so it's helpful to convert to numbers.
 *
 * Arrays and objects are copied recursively; the input is not mutated.
 * Conversion uses `Number(...)`, so bigints outside the safe integer range
 * lose precision. Dates are returned unchanged so that `JSON.stringify` can
 * serialize them. Other non-bigint primitives are returned as-is.
 *
 * @param {unknown} obj object to convert
 * @returns {unknown} converted object
 */
export function toJson(obj: any): unknown {
  if (typeof obj === 'bigint') {
    return Number(obj)
  }
  if (Array.isArray(obj)) {
    // Wrap in an arrow so map's extra (index, array) arguments are not forwarded
    return obj.map((item: unknown) => toJson(item))
  }
  if (obj instanceof Date) {
    // A Date has no own enumerable keys; copying it key-by-key would yield {}
    return obj
  }
  // The typeof check also catches objects without Object.prototype
  // (e.g. Object.create(null)), which `instanceof Object` does not match.
  if (obj instanceof Object || (typeof obj === 'object' && obj !== null)) {
    const newObj: Record<string, unknown> = {}
    for (const key of Object.keys(obj)) {
      newObj[key] = toJson(obj[key])
    }
    return newObj
  }
  return obj
}
|