Error handling for missing byteLength

This commit is contained in:
Kenny Daniel 2024-04-17 17:45:15 -07:00
parent d0213bf0f1
commit 4b7d864319
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
5 changed files with 21 additions and 13 deletions

@ -3,21 +3,22 @@ import { snappyUncompressor } from 'hysnappy'
import { parquetRead } from './src/hyparquet.js'
const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet'
const filename = 'example.parquet'
// download test parquet file if needed
let stat = await fs.stat('example.parquet').catch(() => undefined)
let stat = await fs.stat(filename).catch(() => undefined)
if (!stat) {
console.log('downloading ' + url)
const res = await fetch(url)
if (!res.ok) throw new Error(res.statusText)
// write to file async
const writeStream = createWriteStream('example.parquet')
const writeStream = createWriteStream(filename)
for await (const chunk of res.body) {
writeStream.write(chunk)
}
writeStream.end()
console.log('downloaded example.parquet')
stat = await fs.stat('example.parquet').catch(() => undefined)
stat = await fs.stat(filename).catch(() => undefined)
}
// asyncBuffer
@ -25,7 +26,7 @@ const file = {
byteLength: stat.size,
async slice(start, end) {
// read file slice
const readStream = createReadStream('example.parquet', { start, end })
const readStream = createReadStream(filename, { start, end })
const buffer = await readStreamToArrayBuffer(readStream)
return new Uint8Array(buffer).buffer
},

@ -20,7 +20,7 @@
"type": "module",
"types": "src/hyparquet.d.ts",
"scripts": {
"coverage": "vitest run --coverage",
"coverage": "vitest run --coverage --coverage.include=src",
"demo": "http-server -o",
"lint": "eslint .",
"test": "vitest run",
@ -28,14 +28,14 @@
},
"devDependencies": {
"@types/node": "20.12.7",
"@typescript-eslint/eslint-plugin": "7.6.0",
"@vitest/coverage-v8": "1.4.0",
"@typescript-eslint/eslint-plugin": "7.7.0",
"@vitest/coverage-v8": "1.5.0",
"eslint": "8.57.0",
"eslint-plugin-import": "2.29.1",
"eslint-plugin-jsdoc": "48.2.3",
"http-server": "14.1.1",
"hysnappy": "0.3.0",
"typescript": "5.4.5",
"vitest": "1.4.0"
"vitest": "1.5.0"
}
}

@ -29,11 +29,12 @@ import { deserializeTCompactProtocol } from './thrift.js'
* @returns {Promise<FileMetaData>} parquet metadata object
*/
export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
if (!asyncBuffer) throw new Error('parquet asyncBuffer is required')
if (!asyncBuffer) throw new Error('parquet file is required')
if (!(asyncBuffer.byteLength >= 0)) throw new Error('parquet file byteLength is required')
// fetch last bytes (footer) of the file
const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
const footerBuffer = await asyncBuffer.slice(footerOffset)
const footerBuffer = await asyncBuffer.slice(footerOffset, asyncBuffer.byteLength)
// Check for parquet magic number "PAR1"
const footerView = new DataView(footerBuffer)
@ -72,7 +73,7 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
* @returns {FileMetaData} parquet metadata object
*/
export function parquetMetadata(arrayBuffer) {
if (!arrayBuffer) throw new Error('parquet arrayBuffer is required')
if (!arrayBuffer) throw new Error('parquet file is required')
const view = new DataView(arrayBuffer)
// Validate footer magic number "PAR1"

@ -19,7 +19,7 @@ describe('parquetMetadata', () => {
it('throws for arrayBuffer undefined', () => {
// @ts-expect-error testing invalid input
expect(() => parquetMetadata(undefined)).toThrow('parquet arrayBuffer is required')
expect(() => parquetMetadata(undefined)).toThrow('parquet file is required')
})
it('throws for a too short file', () => {
@ -64,7 +64,7 @@ describe('parquetMetadataAsync', () => {
it('throws for asyncBuffer undefined', async () => {
const arrayBuffer = undefined
await expect(parquetMetadataAsync(arrayBuffer)).rejects
.toThrow('parquet asyncBuffer is required')
.toThrow('parquet file is required')
})
it('throws for invalid magic number', async () => {

@ -10,6 +10,12 @@ describe('parquetRead', () => {
.rejects.toThrow('parquet file is required')
})
it('throws error for undefined byteLength', async () => {
const file = { byteLength: undefined, slice: () => new ArrayBuffer(0) }
await expect(parquetRead({ file }))
.rejects.toThrow('parquet file byteLength is required')
})
it('filter by row', async () => {
const file = fileToAsyncBuffer('test/files/rowgroups.parquet')
await parquetRead({