import { CompressionCodec, ConvertedType, FieldRepetitionType } from './constants.js'
import { schemaTree } from './schema.js'
import { deserializeTCompactProtocol } from './thrift.js'

/**
 * Read parquet metadata from an async buffer.
 *
 * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
 * asynchronously, possibly over the network.
 *
 * You must provide the byteLength of the buffer, typically from a HEAD request.
 *
 * In theory, you could use suffix-range requests to fetch the end of the file
 * and save a round trip. In practice this doesn't work, because Chrome treats
 * the Range header as not safe-listed and will require a CORS pre-flight.
 * So the byteLength is required.
 *
 * To make this efficient, we initially request the last 512kb of the file,
 * which is likely to contain the metadata. If the metadata length exceeds the
 * initial fetch of 512kb, we request the rest of the metadata from the AsyncBuffer.
 *
 * This ensures that we either make one 512kb initial request for the metadata,
 * or a second request for up to the metadata size.
 *
 * @typedef {import("./types.d.ts").AsyncBuffer} AsyncBuffer
 * @typedef {import("./types.d.ts").FileMetaData} FileMetaData
 * @param {AsyncBuffer} asyncBuffer parquet file contents
 * @param {number} initialFetchSize initial fetch size in bytes
 * @returns {Promise<FileMetaData>} parquet metadata object
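 *
 * A minimal usage sketch, assuming a server that supports HTTP Range requests;
 * the `url` and the fetch-based AsyncBuffer below are illustrative, not part of
 * this module:
 *
 * @example
 * const url = 'https://example.com/data.parquet' // hypothetical file
 * const head = await fetch(url, { method: 'HEAD' })
 * const byteLength = Number(head.headers.get('Content-Length'))
 * const asyncBuffer = {
 *   byteLength,
 *   // slice(start, end) returns a Promise<ArrayBuffer> for that byte range
 *   slice: (start, end) => fetch(url, {
 *     headers: { Range: `bytes=${start}-${end === undefined ? byteLength - 1 : end - 1}` },
 *   }).then(res => res.arrayBuffer()),
 * }
 * const metadata = await parquetMetadataAsync(asyncBuffer)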
 */
export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
  // fetch last bytes (footer) of the file
  const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
  const footerBuffer = await asyncBuffer.slice(footerOffset)

  // Check for parquet magic number "PAR1"
  const footerView = new DataView(footerBuffer)
  if (footerView.getUint32(footerBuffer.byteLength - 4, true) !== 0x31524150) {
    throw new Error('parquet file invalid (footer != PAR1)')
  }

  // Parquet files store metadata at the end of the file
  // Metadata length is 4 bytes before the last PAR1
  const metadataLength = footerView.getUint32(footerBuffer.byteLength - 8, true)
  if (metadataLength > asyncBuffer.byteLength - 8) {
    throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${asyncBuffer.byteLength - 8}`)
  }

  // check if metadata size fits inside the initial fetch
  if (metadataLength + 8 > initialFetchSize) {
    // fetch the rest of the metadata
    const metadataOffset = asyncBuffer.byteLength - metadataLength - 8
    const metadataBuffer = await asyncBuffer.slice(metadataOffset, footerOffset)
    // combine initial fetch with the new slice
    const combinedBuffer = new ArrayBuffer(metadataLength + 8)
    const combinedView = new Uint8Array(combinedBuffer)
    combinedView.set(new Uint8Array(metadataBuffer), 0)
    combinedView.set(new Uint8Array(footerBuffer), footerOffset - metadataOffset)
    return parquetMetadata(combinedBuffer)
  } else {
    // parse metadata from the footer
    return parquetMetadata(footerBuffer)
  }
}

/**
 * Read parquet metadata from a buffer.
 *
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @returns {FileMetaData} parquet metadata object
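 *
 * A minimal usage sketch; assumes Node.js and a placeholder file path,
 * with the whole file read into memory:
 *
 * @example
 * import { readFileSync } from 'node:fs'
 * // copy into a standalone ArrayBuffer (a Node Buffer may share a larger pool)
 * const arrayBuffer = new Uint8Array(readFileSync('example.parquet')).buffer
 * const metadata = parquetMetadata(arrayBuffer)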
 */
export function parquetMetadata(arrayBuffer) {
  // DataView for easier manipulation of the buffer
  const view = new DataView(arrayBuffer)

  // Validate footer magic number "PAR1"
  if (view.byteLength < 8) {
    throw new Error('parquet file is too short')
  }
  if (view.getUint32(view.byteLength - 4, true) !== 0x31524150) {
    throw new Error('parquet file invalid (footer != PAR1)')
  }

  // Parquet files store metadata at the end of the file
  // Metadata length is 4 bytes before the last PAR1
  const metadataLengthOffset = view.byteLength - 8
  const metadataLength = view.getUint32(metadataLengthOffset, true)
  if (metadataLength > view.byteLength - 8) {
    // {metadata}, metadata_length, PAR1
    throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${view.byteLength - 8}`)
  }

  const metadataOffset = metadataLengthOffset - metadataLength
  const { value: metadata } = deserializeTCompactProtocol(view.buffer, view.byteOffset + metadataOffset)

  // Parse parquet metadata from thrift data
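  // Field numbers (field_1, field_2, ...) follow the FileMetaData struct in the parquet-format Thrift definition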
  const version = metadata.field_1
  const schema = metadata.field_2.map((/** @type {any} */ field) => ({
    type: field.field_1,
    type_length: field.field_2,
    repetition_type: FieldRepetitionType[field.field_3],
    name: field.field_4,
    num_children: field.field_5,
    converted_type: ConvertedType[field.field_6],
    scale: field.field_7,
    precision: field.field_8,
    field_id: field.field_9,
  }))
  const num_rows = metadata.field_3
  const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({
    columns: rowGroup.field_1.map((/** @type {any} */ column) => ({
      file_path: column.field_1,
      file_offset: column.field_2,
      meta_data: column.field_3 && {
        type: column.field_3.field_1,
        encodings: column.field_3.field_2,
        path_in_schema: column.field_3.field_3,
        codec: CompressionCodec[column.field_3.field_4],
        num_values: column.field_3.field_5,
        total_uncompressed_size: column.field_3.field_6,
        total_compressed_size: column.field_3.field_7,
        key_value_metadata: column.field_3.field_8,
        data_page_offset: column.field_3.field_9,
        index_page_offset: column.field_3.field_10,
        dictionary_page_offset: column.field_3.field_11,
        statistics: column.field_3.field_12 && {
          max: column.field_3.field_12.field_1,
          min: column.field_3.field_12.field_2,
          null_count: column.field_3.field_12.field_3,
          distinct_count: column.field_3.field_12.field_4,
        },
        encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({
          page_type: encodingStat.field_1,
          encoding: encodingStat.field_2,
          count: encodingStat.field_3,
        })),
      },
    })),
    total_byte_size: rowGroup.field_2,
    num_rows: rowGroup.field_3,
    sorting_columns: rowGroup.field_4?.map((/** @type {any} */ sortingColumn) => ({
      column_idx: sortingColumn.field_1,
      descending: sortingColumn.field_2,
      nulls_first: sortingColumn.field_3,
    })),
  }))
  const key_value_metadata = metadata.field_5?.map((/** @type {any} */ keyValue) => ({
    key: keyValue.field_1,
    value: keyValue.field_2,
  }))
  const created_by = metadata.field_6

  return {
    version,
    schema,
    num_rows,
    row_groups,
    key_value_metadata,
    created_by,
    metadata_length: metadataLength,
  }
}

/**
 * Return a tree of schema elements from parquet metadata.
 *
 * @typedef {import("./types.d.ts").SchemaTree} SchemaTree
 * @param {FileMetaData} metadata parquet metadata object
 * @returns {SchemaTree} tree of schema elements
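 *
 * A minimal usage sketch; the shape of the returned tree is defined by the
 * SchemaTree type in types.d.ts:
 *
 * @example
 * const metadata = parquetMetadata(arrayBuffer)
 * const root = parquetSchema(metadata)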
 */
export function parquetSchema(metadata) {
  return schemaTree(metadata.schema, 0)
}