mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-04 18:46:37 +00:00
Update README
This commit is contained in:
parent
271cc72db9
commit
58aed8d9cd
34
README.md
34
README.md
@ -12,18 +12,48 @@ Apache Parquet is an open source, column-oriented data file format designed for
|
||||
|
||||
Dependency free since 2023!
|
||||
|
||||
## Usage
|
||||
## Features
|
||||
|
||||
- Designed to work with huge ML datasets (things like [starcoder](https://huggingface.co/datasets/bigcode/starcoderdata))
|
||||
- Loads metadata separately from data
|
||||
- Data can be filtered by row and column ranges
|
||||
- Only fetches the data needed
|
||||
- Fast data loading for large scale ML applications
|
||||
- Bring data visualization closer to the user, in the browser
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
npm install hyparquet
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
If you're in a Node.js environment, you can load a parquet file as shown in the following example:
|
||||
|
||||
```js
|
||||
const { parquetMetadata } = await import('hyparquet')
|
||||
const fs = await import('fs')
|
||||
|
||||
const buffer = fs.readFileSync('example.parquet')
|
||||
const arrayBuffer = buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
|
||||
const metadata = parquetMetadata(arrayBuffer)
|
||||
```
|
||||
|
||||
If you're in a browser environment, you'll probably get parquet file data either from a file that the user drags and drops, or from a file downloaded from the web.
|
||||
|
||||
To load parquet data in the browser from a remote server using `fetch`:
|
||||
|
||||
```js
|
||||
import { parquetMetadata } from 'hyparquet'
|
||||
|
||||
const metadata = parquetMetdata(arrayBuffer)
|
||||
const res = await fetch(url)
|
||||
const arrayBuffer = await res.arrayBuffer()
|
||||
const metadata = parquetMetadata(arrayBuffer)
|
||||
```
|
||||
|
||||
To parse parquet files from a user drag-and-drop action, see the example in [index.html](index.html).
|
||||
|
||||
## References
|
||||
|
||||
- https://github.com/apache/parquet-format
|
||||
|
||||
@ -308,7 +308,8 @@ function readRle(dataView, offset, header, bitWidth) {
|
||||
* @returns {Decoded<number[]>} array of bit-packed values
|
||||
*/
|
||||
function readBitPacked(dataView, offset, header, bitWidth, remaining) {
|
||||
let count = (header >> 1) * 8
|
||||
// extract number of values to read from header
|
||||
let count = (header >> 1) << 3
|
||||
const mask = maskForBits(bitWidth)
|
||||
|
||||
let data = dataView.getUint8(offset)
|
||||
@ -318,20 +319,24 @@ function readBitPacked(dataView, offset, header, bitWidth, remaining) {
|
||||
/** @type {number[]} */
|
||||
const value = []
|
||||
|
||||
// read values
|
||||
while (count) {
|
||||
// if we have crossed a byte boundary, shift the data
|
||||
if (right > 8) {
|
||||
right -= 8
|
||||
left -= 8
|
||||
data >>= 8
|
||||
} else if (left - right < bitWidth) {
|
||||
// read next byte
|
||||
data |= (dataView.getUint8(offset + byteLength) << left)
|
||||
// if we don't have bitWidth number of bits to read, read next byte
|
||||
data |= dataView.getUint8(offset + byteLength) << left
|
||||
byteLength++
|
||||
left += 8
|
||||
} else {
|
||||
// don't write more than num rows
|
||||
// otherwise, read bitWidth number of bits
|
||||
// don't write more than remaining number of rows
|
||||
// even if there are still bits to read
|
||||
if (remaining > 0) {
|
||||
// emit value
|
||||
// emit value by shifting off to the right and masking
|
||||
value.push((data >> right) & mask)
|
||||
remaining--
|
||||
}
|
||||
@ -340,6 +345,7 @@ function readBitPacked(dataView, offset, header, bitWidth, remaining) {
|
||||
}
|
||||
}
|
||||
|
||||
// return values and number of bytes read
|
||||
return { value, byteLength }
|
||||
}
|
||||
|
||||
|
||||
@ -26,7 +26,7 @@ describe('readPlain', () => {
|
||||
})
|
||||
|
||||
it('reads INT96 values correctly', () => {
|
||||
const buffer = new ArrayBuffer(12) // 12 bytes for a single INT96 value
|
||||
const buffer = new ArrayBuffer(12)
|
||||
const dataView = new DataView(buffer)
|
||||
|
||||
// Example INT96 value split into 64-bit low part and 32-bit high part
|
||||
|
||||
Loading…
Reference in New Issue
Block a user