mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-01 09:46:37 +00:00
Check for magic number before reading metadata length.
Also make sure that metadata length is available.
This commit is contained in:
parent
db26668f5f
commit
5623b67ad5
9
src/hyparquet.d.ts
vendored
9
src/hyparquet.d.ts
vendored
@ -29,12 +29,19 @@ export async function parquetRead(options: ParquetReadOptions): Promise<void>
|
||||
* An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
|
||||
* asynchronously, possibly over the network.
|
||||
*
|
||||
* You must provide the byteLength of the buffer, typically from a HEAD request.
|
||||
*
|
||||
* In theory, you could use suffix-range requests to fetch the end of the file,
|
||||
* and save a round trip. But in practice, this doesn't work because Chrome
|
||||
* deems a suffix-range request to use a non-safelisted header, and will require
|
||||
* a pre-flight. So the byteLength is required.
|
||||
*
|
||||
* To make this efficient, we initially request the last 512kb of the file,
|
||||
* which is likely to contain the metadata. If the metadata length exceeds the
|
||||
* initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer.
|
||||
*
|
||||
* This ensures that we either make one 512kb initial request for the metadata,
|
||||
* or two requests for exactly the metadata size.
|
||||
* or a second request for up to the metadata size.
|
||||
*
|
||||
* @param {AsyncBuffer} asyncBuffer parquet file contents
|
||||
* @param {number} initialFetchSize initial fetch size in bytes (default 512kb)
|
||||
|
||||
@ -31,9 +31,21 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
|
||||
// fetch last bytes (footer) of the file
|
||||
const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
|
||||
const footerBuffer = await asyncBuffer.slice(footerOffset)
|
||||
// check if metadata size fits inside the initial fetch
|
||||
|
||||
// Check for parquet magic number "PAR1"
|
||||
const footerView = new DataView(footerBuffer)
|
||||
if (footerView.getUint32(footerBuffer.byteLength - 4, true) !== 0x31524150) {
|
||||
throw new Error('parquet file invalid (footer != PAR1)')
|
||||
}
|
||||
|
||||
// Parquet files store metadata at the end of the file
|
||||
// Metadata length is 4 bytes before the last PAR1
|
||||
const metadataLength = footerView.getUint32(footerBuffer.byteLength - 8, true)
|
||||
if (metadataLength > asyncBuffer.byteLength - 8) {
|
||||
throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${asyncBuffer.byteLength - 8}`)
|
||||
}
|
||||
|
||||
// check if metadata size fits inside the initial fetch
|
||||
if (metadataLength + 8 > initialFetchSize) {
|
||||
// fetch the rest of the metadata
|
||||
const metadataOffset = asyncBuffer.byteLength - metadataLength - 8
|
||||
@ -65,7 +77,7 @@ export function parquetMetadata(arrayBuffer) {
|
||||
throw new Error('parquet file is too short')
|
||||
}
|
||||
if (view.getUint32(view.byteLength - 4, true) !== 0x31524150) {
|
||||
throw new Error('parquet file invalid magic number')
|
||||
throw new Error('parquet file invalid (footer != PAR1)')
|
||||
}
|
||||
|
||||
// Parquet files store metadata at the end of the file
|
||||
|
||||
@ -33,7 +33,14 @@ describe('parquetMetadata', () => {
|
||||
|
||||
it('should throw an error for invalid magic number', () => {
|
||||
const arrayBuffer = new ArrayBuffer(8)
|
||||
expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number')
|
||||
expect(() => parquetMetadata(arrayBuffer))
|
||||
.toThrow('parquet file invalid (footer != PAR1)')
|
||||
})
|
||||
|
||||
it('should throw an error for invalid metadata length', () => {
|
||||
const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49])
|
||||
expect(() => parquetMetadata(buffer))
|
||||
.toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
|
||||
})
|
||||
})
|
||||
|
||||
@ -50,6 +57,18 @@ describe('parquetMetadataAsync', () => {
|
||||
const result = await parquetMetadataAsync(asyncBuffer, 1609)
|
||||
expect(toJson(result)).containSubset(rowgroupsMetadata)
|
||||
})
|
||||
|
||||
it('should throw an error for invalid magic number', () => {
|
||||
const { buffer } = new Uint8Array([255, 255, 255, 255, 255, 255, 255, 255])
|
||||
expect(parquetMetadataAsync(buffer)).rejects
|
||||
.toThrow('parquet file invalid (footer != PAR1)')
|
||||
})
|
||||
|
||||
it('should throw an error for invalid metadata length', () => {
|
||||
const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49])
|
||||
expect(parquetMetadataAsync(buffer)).rejects
|
||||
.toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
|
||||
})
|
||||
})
|
||||
|
||||
// Parquet v1 from DuckDB
|
||||
|
||||
Loading…
Reference in New Issue
Block a user