Fix initial fetch size larger than file size

Kenny Daniel 2024-02-01 20:12:39 -08:00
parent 63ea067379
commit 800b39b575
GPG Key ID: 90AB653A8CAD7E45
2 changed files with 13 additions and 5 deletions

@@ -45,6 +45,7 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) {
     columnOffset + byteOffset + header.compressed_page_size
   ))
   // decompress bytes
+  /** @type {Uint8Array | undefined} */
   let page
   const uncompressed_page_size = Number(header.uncompressed_page_size)
   const { codec } = columnMetadata
@@ -56,8 +57,8 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) {
   } else if (codec === CompressionCodec.LZO) {
     throw new Error('parquet lzo compression not supported')
   }
-  if (!page || page.length !== uncompressed_page_size) {
-    throw new Error('parquet decompressed page size does not match header')
+  if (page?.length !== uncompressed_page_size) {
+    throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
   }
   // parse page data by type
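
The rewritten guard leans on optional chaining: when no codec branch assigns page, page?.length evaluates to undefined, which can never equal the numeric uncompressed_page_size, so a single comparison now covers both the unsupported-codec case and a decompressor that produced the wrong number of bytes. A standalone sketch of that behavior (illustrative values, not the library's internals):

/** @type {Uint8Array | undefined} */
let page // stays undefined when no codec branch matches
const uncompressed_page_size = 4
// undefined !== 4, so the unassigned case trips the same check
// as a page decompressed to the wrong length
if (page?.length !== uncompressed_page_size) {
  throw new Error(`parquet decompressed page length ${page?.length} does not match header ${uncompressed_page_size}`)
}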
@@ -7,12 +7,19 @@ import { deserializeTCompactProtocol } from './thrift.js'
  * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
  * asynchronously, possibly over the network.
  *
+ * You must provide the byteLength of the buffer, typically from a HEAD request.
+ *
  * In theory, you could use suffix-range requests to fetch the end of the file,
  * and save a round trip. But in practice this doesn't work, because Chrome
  * deems a suffix-range Range header not safe-listed and requires a
  * pre-flight. So the byteLength is required.
+ *
+ * To make this efficient, we initially request the last 512kb of the file,
+ * which is likely to contain the metadata. If the metadata length exceeds the
+ * initial 512kb fetch, we request the rest of the metadata from the AsyncBuffer.
+ *
  * This ensures that we either make one 512kb initial request for the metadata,
- * or two requests for exactly the metadata size.
+ * or a second request for up to the metadata size.
  *
  * @typedef {import("./types.d.ts").AsyncBuffer} AsyncBuffer
  * @typedef {import("./types.d.ts").FileMetaData} FileMetaData
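
The AsyncBuffer shape assumed above is { byteLength, slice(start, end) }, where slice returns a Promise<ArrayBuffer>. A minimal fetch-backed sketch of one (the name asyncBufferFromUrl and the HEAD-then-Range approach are illustrative, not necessarily this library's exports):

/**
 * @param {string} url
 * @returns {Promise<AsyncBuffer>}
 */
async function asyncBufferFromUrl(url) {
  // byteLength from a HEAD request, as the comment above recommends
  const head = await fetch(url, { method: 'HEAD' })
  const byteLength = Number(head.headers.get('Content-Length'))
  return {
    byteLength,
    async slice(start, end = byteLength) {
      // explicit start-end ranges avoid suffix ranges and the pre-flight problem
      const res = await fetch(url, { headers: { Range: `bytes=${start}-${end - 1}` } })
      return res.arrayBuffer()
    },
  }
}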
@@ -22,7 +29,7 @@ import { deserializeTCompactProtocol } from './thrift.js'
  */
 export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
   // fetch last bytes (footer) of the file
-  const footerOffset = asyncBuffer.byteLength - initialFetchSize
+  const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
   const footerBuffer = await asyncBuffer.slice(footerOffset)
   // check if metadata size fits inside the initial fetch
   const footerView = new DataView(footerBuffer)
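
The Math.max clamp is the fix named in the commit title: without it, a file smaller than initialFetchSize produced a negative slice offset. A quick worked example:

const byteLength = 100_000 // a ~100kb file
const initialFetchSize = 1 << 19 // 524288 bytes (512kb)
// before: 100_000 - 524_288 = -424_288, an invalid negative offset
// after: clamped to 0, so slice(0) fetches the whole file in one request
const footerOffset = Math.max(0, byteLength - initialFetchSize)
console.log(footerOffset) // 0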
@@ -31,7 +38,7 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
     // fetch the rest of the metadata
     const metadataOffset = asyncBuffer.byteLength - metadataLength - 8
     const metadataBuffer = await asyncBuffer.slice(metadataOffset, footerOffset)
-    // combine the buffers
+    // combine initial fetch with the new slice
     const combinedBuffer = new ArrayBuffer(metadataLength + 8)
     const combinedView = new Uint8Array(combinedBuffer)
     combinedView.set(new Uint8Array(metadataBuffer), 0)
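
The hunk above allocates metadataLength + 8 bytes and copies the newly fetched metadataBuffer to the front; the hunk ends before the line that presumably copies the initial footer fetch in after it. The general pattern for stitching two ArrayBuffer slices into one contiguous buffer, as a generic helper (not part of this codebase):

/**
 * @param {ArrayBuffer} a
 * @param {ArrayBuffer} b
 * @returns {ArrayBuffer}
 */
function concatBuffers(a, b) {
  const out = new Uint8Array(a.byteLength + b.byteLength)
  out.set(new Uint8Array(a), 0) // first slice at the front
  out.set(new Uint8Array(b), a.byteLength) // second slice directly after
  return out.buffer
}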