Error handling for undefined parquet file

This commit is contained in:
Kenny Daniel 2024-04-05 11:08:10 -07:00
parent e3b3ddafa7
commit 48dc10fd18
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
6 changed files with 37 additions and 12 deletions

@ -27,15 +27,15 @@
"typecheck": "tsc"
},
"devDependencies": {
"@types/node": "20.12.3",
"@types/node": "20.12.4",
"@typescript-eslint/eslint-plugin": "7.5.0",
"@vitest/coverage-v8": "1.4.0",
"eslint": "8.57.0",
"eslint-plugin-import": "2.29.1",
"eslint-plugin-jsdoc": "48.2.2",
"eslint-plugin-jsdoc": "48.2.3",
"http-server": "14.1.1",
"hysnappy": "0.3.0",
"typescript": "5.4.3",
"typescript": "5.4.4",
"vitest": "1.4.0"
}
}

@ -88,3 +88,5 @@ export function assembleObjects(
return output
}
// TODO: depends on the prior definition level

@ -29,6 +29,8 @@ import { deserializeTCompactProtocol } from './thrift.js'
* @returns {Promise<FileMetaData>} parquet metadata object
*/
export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
if (!asyncBuffer) throw new Error('parquet asyncBuffer is required')
// fetch last bytes (footer) of the file
const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
const footerBuffer = await asyncBuffer.slice(footerOffset)
@ -64,12 +66,14 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
}
/**
* Read parquet metadata from a buffer
* Read parquet metadata from a buffer synchronously.
*
* @param {ArrayBuffer} arrayBuffer parquet file contents
* @returns {FileMetaData} parquet metadata object
*/
export function parquetMetadata(arrayBuffer) {
if (!arrayBuffer) throw new Error('parquet arrayBuffer is required')
// DataView for easier manipulation of the buffer
const view = new DataView(arrayBuffer)

@ -30,6 +30,8 @@ import { getColumnName, isMapLike } from './schema.js'
* @returns {Promise<void>} resolves when all requested rows and columns are parsed
*/
export async function parquetRead(options) {
if (!options.file) throw new Error('parquet file is required')
// load metadata if not provided
options.metadata ||= await parquetMetadataAsync(options.file)
if (!options.metadata) throw new Error('parquet metadata not found')

@ -17,12 +17,17 @@ describe('parquetMetadata', () => {
})
})
it('should throw an error for a too short file', () => {
it('throws for arrayBuffer undefined', () => {
// @ts-expect-error testing invalid input
expect(() => parquetMetadata(undefined)).toThrow('parquet arrayBuffer is required')
})
it('throws for a too short file', () => {
const arrayBuffer = new ArrayBuffer(0)
expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')
})
it('should throw an error for invalid metadata length', () => {
it('throws for invalid metadata length', () => {
const arrayBuffer = new ArrayBuffer(12)
const view = new DataView(arrayBuffer)
view.setUint32(0, 0x31524150, true) // magic number PAR1
@ -32,13 +37,13 @@ describe('parquetMetadata', () => {
.toThrow('parquet metadata length 1000 exceeds available buffer 4')
})
it('should throw an error for invalid magic number', () => {
it('throws for invalid magic number', () => {
const arrayBuffer = new ArrayBuffer(8)
expect(() => parquetMetadata(arrayBuffer))
.toThrow('parquet file invalid (footer != PAR1)')
})
it('should throw an error for invalid metadata length', () => {
it('throws for invalid metadata length', () => {
const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49])
expect(() => parquetMetadata(buffer))
.toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
@ -56,15 +61,21 @@ describe('parquetMetadataAsync', () => {
})
})
it('should throw an error for invalid magic number', () => {
it('throws for asyncBuffer undefined', async () => {
const arrayBuffer = undefined
await expect(parquetMetadataAsync(arrayBuffer)).rejects
.toThrow('parquet asyncBuffer is required')
})
it('throws for invalid magic number', async () => {
const { buffer } = new Uint8Array([255, 255, 255, 255, 255, 255, 255, 255])
expect(parquetMetadataAsync(buffer)).rejects
await expect(parquetMetadataAsync(buffer)).rejects
.toThrow('parquet file invalid (footer != PAR1)')
})
it('should throw an error for invalid metadata length', () => {
it('throws for invalid metadata length', async () => {
const { buffer } = new Uint8Array([255, 255, 255, 255, 80, 65, 82, 49])
expect(parquetMetadataAsync(buffer)).rejects
await expect(parquetMetadataAsync(buffer)).rejects
.toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
})
})

@ -34,6 +34,12 @@ describe('parquetRead', () => {
})
})
it('throws reasonable error messages', async () => {
const file = undefined
await expect(parquetRead({ file }))
.rejects.toThrow('parquet file is required')
})
it('should read a single column from a file', async () => {
const asyncBuffer = fileToAsyncBuffer('test/files/datapage_v2.snappy.parquet')
await parquetRead({