diff --git a/README.md b/README.md index 28b9ba1..75bf8bf 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ Hyparquet aims to be the world's most compliant parquet parser. And it runs in t ## Why hyparquet? +Parquet is widely used in data engineering and data science for its efficient storage and processing of large datasets. What if you could use parquet files directly in the browser, without needing a server or backend infrastructure? That's what hyparquet enables. + Existing JavaScript-based parquet readers (like [parquetjs](https://github.com/ironSource/parquetjs)) are no longer actively maintained, may not support streaming or in-browser processing efficiently, and often rely on dependencies that can inflate your bundle size. Hyparquet is actively maintained and designed with modern web usage in mind. @@ -40,8 +42,8 @@ Hyparquet is actively maintained and designed with modern web usage in mind. Check out a minimal parquet viewer demo that shows how to integrate hyparquet into a react web application using [HighTable](https://github.com/hyparam/hightable). - - **Live Demo**: [https://hyparam.github.io/hyperparam-cli/apps/hyparquet-demo/](https://hyparam.github.io/hyperparam-cli/apps/hyparquet-demo/) - - **Source Code**: [https://github.com/hyparam/hyperparam-cli/tree/master/apps/hyparquet-demo](https://github.com/hyparam/hyperparam-cli/tree/master/apps/hyparquet-demo) + - **Live Demo**: [https://hyparam.github.io/demos/hyparquet/](https://hyparam.github.io/demos/hyparquet/) + - **Demo Source Code**: [https://github.com/hyparam/demos/tree/master/hyparquet](https://github.com/hyparam/demos/tree/master/hyparquet) ## Quick Start @@ -86,11 +88,17 @@ You can read just the metadata, including schema and data statistics using the ` To load parquet data in the browser from a remote server using `fetch`: ```javascript -import { parquetMetadata } from 'hyparquet' +import { parquetMetadata, parquetSchema } from 'hyparquet' const res = await fetch(url) const arrayBuffer = await res.arrayBuffer() const metadata = parquetMetadata(arrayBuffer) +// Get total number of rows (convert bigint to number) +const numRows = Number(metadata.num_rows) +// Get nested table schema +const schema = parquetSchema(metadata) +// Get top-level column header names +const columnNames = schema.children.map(e => e.element.name) ``` ### AsyncBuffer diff --git a/package.json b/package.json index c6aba24..fdd9592 100644 --- a/package.json +++ b/package.json @@ -28,17 +28,18 @@ "build:types": "tsc -p ./tsconfig.build.json", "coverage": "vitest run --coverage --coverage.include=src", "lint": "eslint", + "lint:fix": "eslint --fix", "prepare": "npm run build:types", "test": "vitest run" }, "devDependencies": { - "@types/node": "22.13.5", - "@vitest/coverage-v8": "3.0.6", + "@types/node": "22.13.9", + "@vitest/coverage-v8": "3.0.7", "eslint": "9.21.0", "eslint-plugin-jsdoc": "50.6.3", "hyparquet-compressors": "1.0.0", "typescript": "5.7.3", - "typescript-eslint": "8.24.1", - "vitest": "3.0.6" + "typescript-eslint": "8.26.0", + "vitest": "3.0.7" } } diff --git a/src/metadata.js b/src/metadata.js index 48d1f37..adefe61 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -28,8 +28,7 @@ import { deserializeTCompactProtocol } from './thrift.js' * @returns {Promise} parquet metadata object */ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) { - if (!asyncBuffer) throw new Error('parquet file is required') - if (!(asyncBuffer.byteLength >= 0)) throw new Error('parquet file byteLength is required') + if (!asyncBuffer || !(asyncBuffer.byteLength >= 0)) throw new Error('parquetMetadataAsync expected AsyncBuffer') // fetch last bytes (footer) of the file const footerOffset = Math.max(0, asyncBuffer.byteLength - initialFetchSize) @@ -68,11 +67,11 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << /** * Read parquet metadata from a buffer synchronously. * - * @param {ArrayBuffer} arrayBuffer parquet file contents + * @param {ArrayBuffer} arrayBuffer parquet file footer * @returns {FileMetaData} parquet metadata object */ export function parquetMetadata(arrayBuffer) { - if (!arrayBuffer) throw new Error('parquet file is required') + if (!(arrayBuffer instanceof ArrayBuffer)) throw new Error('parquetMetadata expected ArrayBuffer') const view = new DataView(arrayBuffer) // Validate footer magic number "PAR1" diff --git a/src/query.js b/src/query.js index b6d1622..72c6bdf 100644 --- a/src/query.js +++ b/src/query.js @@ -14,6 +14,9 @@ import { equals } from './utils.js' */ export async function parquetQuery(options) { const { file, rowStart, rowEnd, orderBy, filter } = options + if (!file || !(file.byteLength >= 0)) { + throw new Error('parquetQuery expected file AsyncBuffer') + } options.metadata ||= await parquetMetadataAsync(file) // TODO: Faster path for: no orderBy, no rowStart/rowEnd, one row group diff --git a/src/read.js b/src/read.js index c61d490..78d68ed 100644 --- a/src/read.js +++ b/src/read.js @@ -16,10 +16,12 @@ import { concat } from './utils.js' * the chunks. * * @param {ParquetReadOptions} options read options - * @returns {Promise} resolves when all requested rows and columns are parsed + * @returns {Promise} resolves when all requested rows and columns are parsed, all errors are thrown here */ export async function parquetRead(options) { - if (!options.file) throw new Error('parquet file is required') + if (!options.file || !(options.file.byteLength >= 0)) { + throw new Error('parquetRead expected file AsyncBuffer') + } // load metadata if not provided options.metadata ||= await parquetMetadataAsync(options.file) diff --git a/test/metadata.test.js b/test/metadata.test.js index 56049a9..03263ac 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -20,7 +20,7 @@ describe('parquetMetadata', () => { it('throws for arrayBuffer undefined', () => { // @ts-expect-error testing invalid input - expect(() => parquetMetadata(undefined)).toThrow('parquet file is required') + expect(() => parquetMetadata(undefined)).toThrow('parquetMetadata expected ArrayBuffer') }) it('throws for a too short file', () => { @@ -66,7 +66,7 @@ describe('parquetMetadataAsync', () => { const arrayBuffer = undefined // @ts-expect-error testing invalid input await expect(parquetMetadataAsync(arrayBuffer)).rejects - .toThrow('parquet file is required') + .toThrow('parquetMetadataAsync expected AsyncBuffer') }) it('throws for invalid magic number', async () => { diff --git a/test/query.test.js b/test/query.test.js index d5e50cd..4393460 100644 --- a/test/query.test.js +++ b/test/query.test.js @@ -6,7 +6,7 @@ describe('parquetQuery', () => { it('throws error for undefined file', async () => { // @ts-expect-error testing invalid input await expect(parquetQuery({ file: undefined })) - .rejects.toThrow('parquet file is required') + .rejects.toThrow('parquetQuery expected file AsyncBuffer') }) it('reads data without orderBy', async () => { diff --git a/test/read.test.js b/test/read.test.js index fa091c6..ade5c50 100644 --- a/test/read.test.js +++ b/test/read.test.js @@ -6,14 +6,14 @@ describe('parquetRead', () => { it('throws error for undefined file', async () => { // @ts-expect-error testing invalid input await expect(parquetRead({ file: undefined })) - .rejects.toThrow('parquet file is required') + .rejects.toThrow('parquetRead expected file AsyncBuffer') }) it('throws error for undefined byteLength', async () => { const file = { byteLength: undefined, slice: () => new ArrayBuffer(0) } // @ts-expect-error testing invalid input await expect(parquetRead({ file })) - .rejects.toThrow('parquet file byteLength is required') + .rejects.toThrow('parquetRead expected file AsyncBuffer') }) it('filter by row', async () => {