From 800b39b57537faa56aa11c69af669fbe261d5f4e Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Thu, 1 Feb 2024 20:12:39 -0800 Subject: [PATCH] Fix initial fetch size larger than file size --- src/column.js | 5 +++-- src/metadata.js | 13 ++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/column.js b/src/column.js index 3d289a0..0725da2 100644 --- a/src/column.js +++ b/src/column.js @@ -45,6 +45,7 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) { columnOffset + byteOffset + header.compressed_page_size )) // decompress bytes + /** @type {Uint8Array | undefined} */ let page const uncompressed_page_size = Number(header.uncompressed_page_size) const { codec } = columnMetadata @@ -56,8 +57,8 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) { } else if (codec === CompressionCodec.LZO) { throw new Error('parquet lzo compression not supported') } - if (!page || page.length !== uncompressed_page_size) { - throw new Error('parquet decompressed page size does not match header') + if (page?.length !== uncompressed_page_size) { + throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`) } // parse page data by type diff --git a/src/metadata.js b/src/metadata.js index fc7ea3a..cc93458 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -7,12 +7,19 @@ import { deserializeTCompactProtocol } from './thrift.js' * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded * asynchronously, possibly over the network. * + * You must provide the byteLength of the buffer, typically from a HEAD request. + * + * In theory, you could use suffix-range requests to fetch the end of the file, + * and save a round trip. But in practice, this doesn't work because Chrome + * deems suffix-range requests to use a not-safe-listed header, and will require + * a pre-flight. So the byteLength is required. 
+ * * To make this efficient, we initially request the last 512kb of the file, * which is likely to contain the metadata. If the metadata length exceeds the * initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer. * * This ensures that we either make one 512kb initial request for the metadata, - * or two requests for exactly the metadata size. + * or a second request for up to the metadata size. * * @typedef {import("./types.d.ts").AsyncBuffer} AsyncBuffer * @typedef {import("./types.d.ts").FileMetaData} FileMetaData @@ -22,7 +29,7 @@ import { deserializeTCompactProtocol } from './thrift.js' */ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) { // fetch last bytes (footer) of the file - const footerOffset = asyncBuffer.byteLength - initialFetchSize + const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize) const footerBuffer = await asyncBuffer.slice(footerOffset) // check if metadata size fits inside the initial fetch const footerView = new DataView(footerBuffer) @@ -31,7 +38,7 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << // fetch the rest of the metadata const metadataOffset = asyncBuffer.byteLength - metadataLength - 8 const metadataBuffer = await asyncBuffer.slice(metadataOffset, footerOffset) - // combine the buffers + // combine initial fetch with the new slice const combinedBuffer = new ArrayBuffer(metadataLength + 8) const combinedView = new Uint8Array(combinedBuffer) combinedView.set(new Uint8Array(metadataBuffer), 0)