Check for magic number before reading metadata length.

Also make sure that metadata length is available.
This commit is contained in:
Kenny Daniel 2024-02-02 00:06:37 -08:00
parent db26668f5f
commit 5623b67ad5
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
3 changed files with 42 additions and 4 deletions

9
src/hyparquet.d.ts vendored

@ -29,12 +29,19 @@ export async function parquetRead(options: ParquetReadOptions): Promise<void>
* An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
* asynchronously, possibly over the network.
*
* You must provide the byteLength of the buffer, typically from a HEAD request.
*
* In theory, you could use suffix-range requests to fetch the end of the file
* and save a round trip. In practice this doesn't work, because Chrome treats
* a suffix-range request as using a non-safelisted header and requires a
* preflight request. So the byteLength is required.
*
* To make this efficient, we initially request the last 512kb of the file,
* which is likely to contain the metadata. If the metadata length exceeds the
* initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer.
*
* This ensures that we either make one 512kb initial request for the metadata,
* or two requests for exactly the metadata size.
* or a second request for up to the metadata size.
*
* @param {AsyncBuffer} asyncBuffer parquet file contents
* @param {number} initialFetchSize initial fetch size in bytes (default 512kb)

@ -31,9 +31,21 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
// fetch last bytes (footer) of the file
const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
const footerBuffer = await asyncBuffer.slice(footerOffset)
// check if metadata size fits inside the initial fetch
// Check for parquet magic number "PAR1"
const footerView = new DataView(footerBuffer)
if (footerView.getUint32(footerBuffer.byteLength - 4, true) !== 0x31524150) {
throw new Error('parquet file invalid (footer != PAR1)')
}
// Parquet files store metadata at the end of the file
// Metadata length is 4 bytes before the last PAR1
const metadataLength = footerView.getUint32(footerBuffer.byteLength - 8, true)
if (metadataLength > asyncBuffer.byteLength - 8) {
throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${asyncBuffer.byteLength - 8}`)
}
// check if metadata size fits inside the initial fetch
if (metadataLength + 8 > initialFetchSize) {
// fetch the rest of the metadata
const metadataOffset = asyncBuffer.byteLength - metadataLength - 8
@ -65,7 +77,7 @@ export function parquetMetadata(arrayBuffer) {
throw new Error('parquet file is too short')
}
if (view.getUint32(view.byteLength - 4, true) !== 0x31524150) {
throw new Error('parquet file invalid magic number')
throw new Error('parquet file invalid (footer != PAR1)')
}
// Parquet files store metadata at the end of the file

@ -33,7 +33,14 @@ describe('parquetMetadata', () => {
it('should throw an error for invalid magic number', () => {
const arrayBuffer = new ArrayBuffer(8)
expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number')
expect(() => parquetMetadata(arrayBuffer))
.toThrow('parquet file invalid (footer != PAR1)')
})
it('should throw an error for invalid metadata length', () => {
// 8-byte file: first 4 bytes are the little-endian metadata length
// (0xFFFFFFFF = 4294967295), last 4 bytes are ASCII "PAR1" (80, 65, 82, 49).
// The magic number is valid, but the declared length cannot fit in the
// 0 bytes that remain once the 8-byte footer is excluded.
const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49])
expect(() => parquetMetadata(buffer))
.toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
})
})
@ -50,6 +57,18 @@ describe('parquetMetadataAsync', () => {
const result = await parquetMetadataAsync(asyncBuffer, 1609)
expect(toJson(result)).containSubset(rowgroupsMetadata)
})
it('should throw an error for invalid magic number', async () => {
  // 8 bytes of 0xFF: the trailing 4 bytes are not ASCII "PAR1", so the
  // footer magic-number check must reject before any length is read.
  const { buffer } = new Uint8Array([255, 255, 255, 255, 255, 255, 255, 255])
  // The `.rejects` assertion is itself a promise; it must be awaited (or
  // returned), otherwise the test completes before the assertion settles
  // and can never fail.
  await expect(parquetMetadataAsync(buffer)).rejects
    .toThrow('parquet file invalid (footer != PAR1)')
})
it('should throw an error for invalid metadata length', async () => {
  // 8-byte file: little-endian metadata length 0xFFFFFFFF (4294967295)
  // followed by ASCII "PAR1" (80, 65, 82, 49). The magic number is valid,
  // but the declared length exceeds the 0 bytes left after the footer.
  const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49])
  // The `.rejects` assertion is itself a promise; it must be awaited (or
  // returned), otherwise the test completes before the assertion settles
  // and can never fail.
  await expect(parquetMetadataAsync(buffer)).rejects
    .toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
})
})
// Parquet v1 from DuckDB