Better error messages

This commit is contained in:
Kenny Daniel 2025-03-04 09:38:39 -08:00
parent c70d5c2732
commit 2456cdc85f
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
8 changed files with 31 additions and 18 deletions

@ -33,6 +33,8 @@ Hyparquet aims to be the world's most compliant parquet parser. And it runs in t
## Why hyparquet?
Parquet is widely used in data engineering and data science for its efficient storage and processing of large datasets. What if you could use parquet files directly in the browser, without needing a server or backend infrastructure? That's what hyparquet enables.
Existing JavaScript-based parquet readers (like [parquetjs](https://github.com/ironSource/parquetjs)) are no longer actively maintained, may not support streaming or in-browser processing efficiently, and often rely on dependencies that can inflate your bundle size.
Hyparquet is actively maintained and designed with modern web usage in mind.
@ -40,8 +42,8 @@ Hyparquet is actively maintained and designed with modern web usage in mind.
Check out a minimal parquet viewer demo that shows how to integrate hyparquet into a React web application using [HighTable](https://github.com/hyparam/hightable).
- **Live Demo**: [https://hyparam.github.io/hyperparam-cli/apps/hyparquet-demo/](https://hyparam.github.io/hyperparam-cli/apps/hyparquet-demo/)
- **Source Code**: [https://github.com/hyparam/hyperparam-cli/tree/master/apps/hyparquet-demo](https://github.com/hyparam/hyperparam-cli/tree/master/apps/hyparquet-demo)
- **Live Demo**: [https://hyparam.github.io/demos/hyparquet/](https://hyparam.github.io/demos/hyparquet/)
- **Demo Source Code**: [https://github.com/hyparam/demos/tree/master/hyparquet](https://github.com/hyparam/demos/tree/master/hyparquet)
## Quick Start
@ -86,11 +88,17 @@ You can read just the metadata, including schema and data statistics using the `
To load parquet data in the browser from a remote server using `fetch`:
```javascript
import { parquetMetadata } from 'hyparquet'
import { parquetMetadata, parquetSchema } from 'hyparquet'
const res = await fetch(url)
const arrayBuffer = await res.arrayBuffer()
const metadata = parquetMetadata(arrayBuffer)
// Get total number of rows (convert bigint to number)
const numRows = Number(metadata.num_rows)
// Get nested table schema
const schema = parquetSchema(metadata)
// Get top-level column header names
const columnNames = schema.children.map(e => e.element.name)
```
### AsyncBuffer

@ -28,17 +28,18 @@
"build:types": "tsc -p ./tsconfig.build.json",
"coverage": "vitest run --coverage --coverage.include=src",
"lint": "eslint",
"lint:fix": "eslint --fix",
"prepare": "npm run build:types",
"test": "vitest run"
},
"devDependencies": {
"@types/node": "22.13.5",
"@vitest/coverage-v8": "3.0.6",
"@types/node": "22.13.9",
"@vitest/coverage-v8": "3.0.7",
"eslint": "9.21.0",
"eslint-plugin-jsdoc": "50.6.3",
"hyparquet-compressors": "1.0.0",
"typescript": "5.7.3",
"typescript-eslint": "8.24.1",
"vitest": "3.0.6"
"typescript-eslint": "8.26.0",
"vitest": "3.0.7"
}
}

@ -28,8 +28,7 @@ import { deserializeTCompactProtocol } from './thrift.js'
* @returns {Promise<FileMetaData>} parquet metadata object
*/
export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
if (!asyncBuffer) throw new Error('parquet file is required')
if (!(asyncBuffer.byteLength >= 0)) throw new Error('parquet file byteLength is required')
if (!asyncBuffer || !(asyncBuffer.byteLength >= 0)) throw new Error('parquetMetadataAsync expected AsyncBuffer')
// fetch last bytes (footer) of the file
const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
@ -68,11 +67,11 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
/**
* Read parquet metadata from a buffer synchronously.
*
* @param {ArrayBuffer} arrayBuffer parquet file contents
* @param {ArrayBuffer} arrayBuffer parquet file footer
* @returns {FileMetaData} parquet metadata object
*/
export function parquetMetadata(arrayBuffer) {
if (!arrayBuffer) throw new Error('parquet file is required')
if (!(arrayBuffer instanceof ArrayBuffer)) throw new Error('parquetMetadata expected ArrayBuffer')
const view = new DataView(arrayBuffer)
// Validate footer magic number "PAR1"

@ -14,6 +14,9 @@ import { equals } from './utils.js'
*/
export async function parquetQuery(options) {
const { file, rowStart, rowEnd, orderBy, filter } = options
if (!file || !(file.byteLength >= 0)) {
throw new Error('parquetQuery expected file AsyncBuffer')
}
options.metadata ||= await parquetMetadataAsync(file)
// TODO: Faster path for: no orderBy, no rowStart/rowEnd, one row group

@ -16,10 +16,12 @@ import { concat } from './utils.js'
* the chunks.
*
* @param {ParquetReadOptions} options read options
* @returns {Promise<void>} resolves when all requested rows and columns are parsed
* @returns {Promise<void>} resolves when all requested rows and columns are parsed; all errors are thrown here
*/
export async function parquetRead(options) {
if (!options.file) throw new Error('parquet file is required')
if (!options.file || !(options.file.byteLength >= 0)) {
throw new Error('parquetRead expected file AsyncBuffer')
}
// load metadata if not provided
options.metadata ||= await parquetMetadataAsync(options.file)

@ -20,7 +20,7 @@ describe('parquetMetadata', () => {
it('throws for arrayBuffer undefined', () => {
// @ts-expect-error testing invalid input
expect(() => parquetMetadata(undefined)).toThrow('parquet file is required')
expect(() => parquetMetadata(undefined)).toThrow('parquetMetadata expected ArrayBuffer')
})
it('throws for a too short file', () => {
@ -66,7 +66,7 @@ describe('parquetMetadataAsync', () => {
const arrayBuffer = undefined
// @ts-expect-error testing invalid input
await expect(parquetMetadataAsync(arrayBuffer)).rejects
.toThrow('parquet file is required')
.toThrow('parquetMetadataAsync expected AsyncBuffer')
})
it('throws for invalid magic number', async () => {

@ -6,7 +6,7 @@ describe('parquetQuery', () => {
it('throws error for undefined file', async () => {
// @ts-expect-error testing invalid input
await expect(parquetQuery({ file: undefined }))
.rejects.toThrow('parquet file is required')
.rejects.toThrow('parquetQuery expected file AsyncBuffer')
})
it('reads data without orderBy', async () => {

@ -6,14 +6,14 @@ describe('parquetRead', () => {
it('throws error for undefined file', async () => {
// @ts-expect-error testing invalid input
await expect(parquetRead({ file: undefined }))
.rejects.toThrow('parquet file is required')
.rejects.toThrow('parquetRead expected file AsyncBuffer')
})
it('throws error for undefined byteLength', async () => {
const file = { byteLength: undefined, slice: () => new ArrayBuffer(0) }
// @ts-expect-error testing invalid input
await expect(parquetRead({ file }))
.rejects.toThrow('parquet file byteLength is required')
.rejects.toThrow('parquetRead expected file AsyncBuffer')
})
it('filter by row', async () => {