diff --git a/src/hyparquet.d.ts b/src/hyparquet.d.ts index 1a17c57..55582ff 100644 --- a/src/hyparquet.d.ts +++ b/src/hyparquet.d.ts @@ -29,12 +29,19 @@ export async function parquetRead(options: ParquetReadOptions): Promise * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded * asynchronously, possibly over the network. * + * You must provide the byteLength of the buffer, typically from a HEAD request. + * + * In theory, you could use suffix-range requests to fetch the end of the file, + * and save a round trip. But in practice, this doesn't work because Chrome + * does not treat a suffix-range Range header as safe-listed, and will require + * a CORS preflight request. So the byteLength is required. + * * To make this efficient, we initially request the last 512kb of the file, * which is likely to contain the metadata. If the metadata length exceeds the * initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer. * * This ensures that we either make one 512kb initial request for the metadata, - * or two requests for exactly the metadata size. + * or that request plus a second request for up to the metadata size. 
* * @param {AsyncBuffer} asyncBuffer parquet file contents * @param {number} initialFetchSize initial fetch size in bytes (default 512kb) diff --git a/src/metadata.js b/src/metadata.js index cc93458..937bbae 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -31,9 +31,21 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << // fetch last bytes (footer) of the file const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize) const footerBuffer = await asyncBuffer.slice(footerOffset) - // check if metadata size fits inside the initial fetch + + // Check for parquet magic number "PAR1" const footerView = new DataView(footerBuffer) + if (footerView.getUint32(footerBuffer.byteLength - 4, true) !== 0x31524150) { + throw new Error('parquet file invalid (footer != PAR1)') + } + + // Parquet files store metadata at the end of the file + // Metadata length is 4 bytes before the last PAR1 const metadataLength = footerView.getUint32(footerBuffer.byteLength - 8, true) + if (metadataLength > asyncBuffer.byteLength - 8) { + throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${asyncBuffer.byteLength - 8}`) + } + + // check if metadata size fits inside the initial fetch if (metadataLength + 8 > initialFetchSize) { // fetch the rest of the metadata const metadataOffset = asyncBuffer.byteLength - metadataLength - 8 @@ -65,7 +77,7 @@ export function parquetMetadata(arrayBuffer) { throw new Error('parquet file is too short') } if (view.getUint32(view.byteLength - 4, true) !== 0x31524150) { - throw new Error('parquet file invalid magic number') + throw new Error('parquet file invalid (footer != PAR1)') } // Parquet files store metadata at the end of the file diff --git a/test/metadata.test.js b/test/metadata.test.js index 9aa438d..2cb60f8 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -33,7 +33,14 @@ describe('parquetMetadata', () => { it('should throw an error for invalid magic number', 
() => { const arrayBuffer = new ArrayBuffer(8) - expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number') + expect(() => parquetMetadata(arrayBuffer)) + .toThrow('parquet file invalid (footer != PAR1)') + }) + + it('should throw an error for invalid metadata length', () => { + const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49]) + expect(() => parquetMetadata(buffer)) + .toThrow('parquet metadata length 4294967295 exceeds available buffer 0') }) }) @@ -50,6 +57,18 @@ describe('parquetMetadataAsync', () => { const result = await parquetMetadataAsync(asyncBuffer, 1609) expect(toJson(result)).containSubset(rowgroupsMetadata) }) + + it('should throw an error for invalid magic number', () => { + const { buffer } = new Uint8Array([255, 255, 255, 255, 255, 255, 255, 255]) + expect(parquetMetadataAsync(buffer)).rejects + .toThrow('parquet file invalid (footer != PAR1)') + }) + + it('should throw an error for invalid metadata length', () => { + const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49]) + expect(parquetMetadataAsync(buffer)).rejects + .toThrow('parquet metadata length 4294967295 exceeds available buffer 0') + }) }) // Parquet v1 from DuckDB