mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-26 15:16:38 +00:00
Better error messages
This commit is contained in:
parent
c70d5c2732
commit
2456cdc85f
14
README.md
14
README.md
@ -33,6 +33,8 @@ Hyparquet aims to be the world's most compliant parquet parser. And it runs in t
|
||||
|
||||
## Why hyparquet?
|
||||
|
||||
Parquet is widely used in data engineering and data science for its efficient storage and processing of large datasets. What if you could use parquet files directly in the browser, without needing a server or backend infrastructure? That's what hyparquet enables.
|
||||
|
||||
Existing JavaScript-based parquet readers (like [parquetjs](https://github.com/ironSource/parquetjs)) are no longer actively maintained, may not support streaming or in-browser processing efficiently, and often rely on dependencies that can inflate your bundle size.
|
||||
Hyparquet is actively maintained and designed with modern web usage in mind.
|
||||
|
||||
@ -40,8 +42,8 @@ Hyparquet is actively maintained and designed with modern web usage in mind.
|
||||
|
||||
Check out a minimal parquet viewer demo that shows how to integrate hyparquet into a React web application using [HighTable](https://github.com/hyparam/hightable).
|
||||
|
||||
- **Live Demo**: [https://hyparam.github.io/hyperparam-cli/apps/hyparquet-demo/](https://hyparam.github.io/hyperparam-cli/apps/hyparquet-demo/)
|
||||
- **Source Code**: [https://github.com/hyparam/hyperparam-cli/tree/master/apps/hyparquet-demo](https://github.com/hyparam/hyperparam-cli/tree/master/apps/hyparquet-demo)
|
||||
- **Live Demo**: [https://hyparam.github.io/demos/hyparquet/](https://hyparam.github.io/demos/hyparquet/)
|
||||
- **Demo Source Code**: [https://github.com/hyparam/demos/tree/master/hyparquet](https://github.com/hyparam/demos/tree/master/hyparquet)
|
||||
|
||||
## Quick Start
|
||||
|
||||
@ -86,11 +88,17 @@ You can read just the metadata, including schema and data statistics using the `
|
||||
To load parquet data in the browser from a remote server using `fetch`:
|
||||
|
||||
```javascript
|
||||
import { parquetMetadata } from 'hyparquet'
|
||||
import { parquetMetadata, parquetSchema } from 'hyparquet'
|
||||
|
||||
const res = await fetch(url)
|
||||
const arrayBuffer = await res.arrayBuffer()
|
||||
const metadata = parquetMetadata(arrayBuffer)
|
||||
// Get total number of rows (convert bigint to number)
|
||||
const numRows = Number(metadata.num_rows)
|
||||
// Get nested table schema
|
||||
const schema = parquetSchema(metadata)
|
||||
// Get top-level column header names
|
||||
const columnNames = schema.children.map(e => e.element.name)
|
||||
```
|
||||
|
||||
### AsyncBuffer
|
||||
|
||||
@ -28,17 +28,18 @@
|
||||
"build:types": "tsc -p ./tsconfig.build.json",
|
||||
"coverage": "vitest run --coverage --coverage.include=src",
|
||||
"lint": "eslint",
|
||||
"lint:fix": "eslint --fix",
|
||||
"prepare": "npm run build:types",
|
||||
"test": "vitest run"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "22.13.5",
|
||||
"@vitest/coverage-v8": "3.0.6",
|
||||
"@types/node": "22.13.9",
|
||||
"@vitest/coverage-v8": "3.0.7",
|
||||
"eslint": "9.21.0",
|
||||
"eslint-plugin-jsdoc": "50.6.3",
|
||||
"hyparquet-compressors": "1.0.0",
|
||||
"typescript": "5.7.3",
|
||||
"typescript-eslint": "8.24.1",
|
||||
"vitest": "3.0.6"
|
||||
"typescript-eslint": "8.26.0",
|
||||
"vitest": "3.0.7"
|
||||
}
|
||||
}
|
||||
|
||||
@ -28,8 +28,7 @@ import { deserializeTCompactProtocol } from './thrift.js'
|
||||
* @returns {Promise<FileMetaData>} parquet metadata object
|
||||
*/
|
||||
export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
|
||||
if (!asyncBuffer) throw new Error('parquet file is required')
|
||||
if (!(asyncBuffer.byteLength >= 0)) throw new Error('parquet file byteLength is required')
|
||||
if (!asyncBuffer || !(asyncBuffer.byteLength >= 0)) throw new Error('parquetMetadataAsync expected AsyncBuffer')
|
||||
|
||||
// fetch last bytes (footer) of the file
|
||||
const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize)
|
||||
@ -68,11 +67,11 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
|
||||
/**
|
||||
* Read parquet metadata from a buffer synchronously.
|
||||
*
|
||||
* @param {ArrayBuffer} arrayBuffer parquet file contents
|
||||
* @param {ArrayBuffer} arrayBuffer parquet file footer
|
||||
* @returns {FileMetaData} parquet metadata object
|
||||
*/
|
||||
export function parquetMetadata(arrayBuffer) {
|
||||
if (!arrayBuffer) throw new Error('parquet file is required')
|
||||
if (!(arrayBuffer instanceof ArrayBuffer)) throw new Error('parquetMetadata expected ArrayBuffer')
|
||||
const view = new DataView(arrayBuffer)
|
||||
|
||||
// Validate footer magic number "PAR1"
|
||||
|
||||
@ -14,6 +14,9 @@ import { equals } from './utils.js'
|
||||
*/
|
||||
export async function parquetQuery(options) {
|
||||
const { file, rowStart, rowEnd, orderBy, filter } = options
|
||||
if (!file || !(file.byteLength >= 0)) {
|
||||
throw new Error('parquetQuery expected file AsyncBuffer')
|
||||
}
|
||||
options.metadata ||= await parquetMetadataAsync(file)
|
||||
|
||||
// TODO: Faster path for: no orderBy, no rowStart/rowEnd, one row group
|
||||
|
||||
@ -16,10 +16,12 @@ import { concat } from './utils.js'
|
||||
* the chunks.
|
||||
*
|
||||
* @param {ParquetReadOptions} options read options
|
||||
* @returns {Promise<void>} resolves when all requested rows and columns are parsed
|
||||
* @returns {Promise<void>} resolves when all requested rows and columns are parsed; any error rejects this promise
|
||||
*/
|
||||
export async function parquetRead(options) {
|
||||
if (!options.file) throw new Error('parquet file is required')
|
||||
if (!options.file || !(options.file.byteLength >= 0)) {
|
||||
throw new Error('parquetRead expected file AsyncBuffer')
|
||||
}
|
||||
|
||||
// load metadata if not provided
|
||||
options.metadata ||= await parquetMetadataAsync(options.file)
|
||||
|
||||
@ -20,7 +20,7 @@ describe('parquetMetadata', () => {
|
||||
|
||||
it('throws for arrayBuffer undefined', () => {
|
||||
// @ts-expect-error testing invalid input
|
||||
expect(() => parquetMetadata(undefined)).toThrow('parquet file is required')
|
||||
expect(() => parquetMetadata(undefined)).toThrow('parquetMetadata expected ArrayBuffer')
|
||||
})
|
||||
|
||||
it('throws for a too short file', () => {
|
||||
@ -66,7 +66,7 @@ describe('parquetMetadataAsync', () => {
|
||||
const arrayBuffer = undefined
|
||||
// @ts-expect-error testing invalid input
|
||||
await expect(parquetMetadataAsync(arrayBuffer)).rejects
|
||||
.toThrow('parquet file is required')
|
||||
.toThrow('parquetMetadataAsync expected AsyncBuffer')
|
||||
})
|
||||
|
||||
it('throws for invalid magic number', async () => {
|
||||
|
||||
@ -6,7 +6,7 @@ describe('parquetQuery', () => {
|
||||
it('throws error for undefined file', async () => {
|
||||
// @ts-expect-error testing invalid input
|
||||
await expect(parquetQuery({ file: undefined }))
|
||||
.rejects.toThrow('parquet file is required')
|
||||
.rejects.toThrow('parquetQuery expected file AsyncBuffer')
|
||||
})
|
||||
|
||||
it('reads data without orderBy', async () => {
|
||||
|
||||
@ -6,14 +6,14 @@ describe('parquetRead', () => {
|
||||
it('throws error for undefined file', async () => {
|
||||
// @ts-expect-error testing invalid input
|
||||
await expect(parquetRead({ file: undefined }))
|
||||
.rejects.toThrow('parquet file is required')
|
||||
.rejects.toThrow('parquetRead expected file AsyncBuffer')
|
||||
})
|
||||
|
||||
it('throws error for undefined byteLength', async () => {
|
||||
const file = { byteLength: undefined, slice: () => new ArrayBuffer(0) }
|
||||
// @ts-expect-error testing invalid input
|
||||
await expect(parquetRead({ file }))
|
||||
.rejects.toThrow('parquet file byteLength is required')
|
||||
.rejects.toThrow('parquetRead expected file AsyncBuffer')
|
||||
})
|
||||
|
||||
it('filter by row', async () => {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user