From 58aed8d9cd871edb55b7344031096bbc2f9be763 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Tue, 9 Jan 2024 15:15:08 -0800 Subject: [PATCH] Update README --- README.md | 34 ++++++++++++++++++++++++++++++++-- src/encoding.js | 16 +++++++++++----- test/encoding.test.js | 2 +- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c2b3fd7..a624806 100644 --- a/README.md +++ b/README.md @@ -12,18 +12,48 @@ Apache Parquet is an open source, column-oriented data file format designed for Dependency free since 2023! -## Usage +## Features + +- Designed to work with huge ML datasets (things like [starcoder](https://huggingface.co/datasets/bigcode/starcoderdata)) +- Loads metadata separately from data +- Data can be filtered by row and column ranges +- Only fetches the data needed +- Fast data loading for large-scale ML applications +- Brings data visualization closer to the user, in the browser + +## Installation ```bash npm install hyparquet ``` +## Usage + +If you're in a Node.js environment, you can load a parquet file with the following example: + +```js +const { parquetMetadata } = await import('hyparquet') +const fs = await import('fs') + +const buffer = fs.readFileSync('example.parquet') +const arrayBuffer = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) +const metadata = parquetMetadata(arrayBuffer) +``` + +If you're in a browser environment, you'll probably get parquet file data either from a file the user drags and drops, or by downloading it from the web. + +To load parquet data in the browser from a remote server using `fetch`: + +```js import { parquetMetadata } from 'hyparquet' -const metadata = parquetMetdata(arrayBuffer) +const res = await fetch(url) +const arrayBuffer = await res.arrayBuffer() +const metadata = parquetMetadata(arrayBuffer) ``` +To parse parquet files from a user drag-and-drop action, see the example in [index.html](index.html). 
+ ## References - https://github.com/apache/parquet-format diff --git a/src/encoding.js b/src/encoding.js index 8e7ca49..371b5ac 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -308,7 +308,8 @@ function readRle(dataView, offset, header, bitWidth) { * @returns {Decoded} array of bit-packed values */ function readBitPacked(dataView, offset, header, bitWidth, remaining) { - let count = (header >> 1) * 8 + // extract number of values to read from header + let count = (header >> 1) << 3 const mask = maskForBits(bitWidth) let data = dataView.getUint8(offset) @@ -318,20 +319,24 @@ function readBitPacked(dataView, offset, header, bitWidth, remaining) { /** @type {number[]} */ const value = [] + // read values while (count) { + // if we have crossed a byte boundary, shift the data if (right > 8) { right -= 8 left -= 8 data >>= 8 } else if (left - right < bitWidth) { - // read next byte - data |= (dataView.getUint8(offset + byteLength) << left) + // if we don't have bitWidth number of bits to read, read next byte + data |= dataView.getUint8(offset + byteLength) << left byteLength++ left += 8 } else { - // don't write more than num rows + // otherwise, read bitWidth number of bits + // don't write more than remaining number of rows + // even if there are still bits to read if (remaining > 0) { - // emit value + // emit value by shifting off to the right and masking value.push((data >> right) & mask) remaining-- } @@ -340,6 +345,7 @@ function readBitPacked(dataView, offset, header, bitWidth, remaining) { } } + // return values and number of bytes read return { value, byteLength } } diff --git a/test/encoding.test.js b/test/encoding.test.js index 8a15644..603a754 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -26,7 +26,7 @@ describe('readPlain', () => { }) it('reads INT96 values correctly', () => { - const buffer = new ArrayBuffer(12) // 12 bytes for a single INT96 value + const buffer = new ArrayBuffer(12) const dataView = new DataView(buffer) // 
Example INT96 value split into 64-bit low part and 32-bit high part