mirror of
https://github.com/asadbek064/hyparquet-compressors.git
synced 2026-01-11 21:26:38 +00:00
Update README
This commit is contained in:
parent
dd77d556a4
commit
76dc980e8a
35
README.md
35
README.md
@ -8,26 +8,33 @@
|
||||
[MIT License](https://opensource.org/licenses/MIT)
|
||||

|
||||
|
||||
This package exports a `compressors` object intended to be passed into [hyparquet](https://github.com/hyparam/hyparquet).
|
||||
This package provides decompressors for various compression codecs.
|
||||
It is designed to be used with [hyparquet](https://github.com/hyparam/hyparquet) in order to provide full support for all parquet compression formats.
|
||||
|
||||
## Introduction
|
||||
|
||||
[Apache Parquet](https://parquet.apache.org) is a popular columnar storage format that is widely used in data engineering, data science, and machine learning applications for efficiently storing and processing large datasets. It supports a number of different compression formats, but most parquet files use snappy compression.
|
||||
|
||||
The hyparquet library by default only supports `uncompressed` and `snappy` compressed files. The `hyparquet-compressors` package extends support for all legal parquet compression formats.
|
||||
[Hyparquet](https://github.com/hyparam/hyparquet) is a fast and lightweight parquet reader that is designed to work in both node.js and the browser.
|
||||
|
||||
The `hyparquet-compressors` package works in both node.js and the browser. It uses js and wasm packages and has no system dependencies.
|
||||
By default, hyparquet only supports `uncompressed` and `snappy` compressed files (the most common parquet compression codecs). The `hyparquet-compressors` package extends support for all legal parquet compression formats.
|
||||
|
||||
## Usage
|
||||
`hyparquet-compressors` works in both node.js and the browser. It uses js and wasm packages and has no system dependencies.
|
||||
|
||||
## Hyparquet
|
||||
|
||||
To use `hyparquet-compressors` with `hyparquet`, simply pass the `compressors` object to the `parquetReadObjects` function.
|
||||
|
||||
```js
|
||||
import { parquetRead } from 'hyparquet'
|
||||
import { parquetReadObjects } from 'hyparquet'
|
||||
import { compressors } from 'hyparquet-compressors'
|
||||
|
||||
await parquetRead({ file, compressors, onComplete: console.log })
|
||||
const data = await parquetReadObjects({ file, compressors })
|
||||
```
|
||||
|
||||
See [hyparquet](https://github.com/hyparam/hyparquet) repo for more info.
|
||||
|
||||
# Compression formats
|
||||
## Compression formats
|
||||
|
||||
Parquet compression types supported with `hyparquet-compressors`:
|
||||
- [X] Uncompressed
|
||||
@ -39,35 +46,35 @@ Parquet compression types supported with `hyparquet-compressors`:
|
||||
- [X] ZSTD
|
||||
- [X] LZ4_RAW
|
||||
|
||||
## Snappy
|
||||
### Snappy
|
||||
|
||||
Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using minimal wasm.
|
||||
|
||||
## Gzip
|
||||
### Gzip
|
||||
|
||||
New gzip implementation adapted from [fflate](https://github.com/101arrowz/fflate).
|
||||
Includes modifications to handle repeated back-to-back gzip streams, which sometimes occur in parquet files but are not supported by fflate.
|
||||
|
||||
## Brotli
|
||||
### Brotli
|
||||
|
||||
Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js) which pre-compresses the brotli dictionary using gzip to minimize the distribution bundle size.
|
||||
|
||||
## LZ4
|
||||
### LZ4
|
||||
|
||||
New LZ4 implementation includes support for legacy hadoop LZ4 frame format used on some old parquet files.
|
||||
|
||||
## Zstd
|
||||
### Zstd
|
||||
|
||||
Uses [fzstd](https://github.com/101arrowz/fzstd) for Zstandard decompression.
|
||||
|
||||
# Bundle size
|
||||
## Bundle size
|
||||
|
||||
| File | Size |
|
||||
| --- | --- |
|
||||
| hyparquet-compressors.min.js | 116.1kb |
|
||||
| hyparquet-compressors.min.js.gz | 75.2kb |
|
||||
|
||||
# References
|
||||
## References
|
||||
|
||||
- https://parquet.apache.org/docs/file-format/data-pages/compression/
|
||||
- https://en.wikipedia.org/wiki/Brotli
|
||||
|
||||
@ -28,7 +28,7 @@ const kBitMask = new Uint32Array([
|
||||
* Input byte buffer, consisting of a ringbuffer and a "slack" region where
|
||||
* bytes from the start of the ringbuffer are copied.
|
||||
*
|
||||
* @typedef {import('./brotliStreams.js').BrotliInput} BrotliInput
|
||||
* @typedef {import('./brotli.streams.js').BrotliInput} BrotliInput
|
||||
* @param {BrotliInput} input
|
||||
*/
|
||||
function BrotliBitReader(input) {
|
||||
|
||||
@ -4,10 +4,10 @@
|
||||
*/
|
||||
|
||||
import BrotliBitReader from './brotli.bitreader.js'
|
||||
import { lookup, lookupOffsets } from './brotliContext.js'
|
||||
import { lookup, lookupOffsets } from './brotli.context.js'
|
||||
import { HuffmanCode, readHuffmanCode, readSymbol } from './brotli.huffman.js'
|
||||
import { kBlockLengthPrefixCode, kCopyLengthPrefixCode, kCopyRangeLut, kInsertLengthPrefixCode, kInsertRangeLut } from './brotli.prefix.js'
|
||||
import { BrotliInput, BrotliOutput } from './brotliStreams.js'
|
||||
import { BrotliInput, BrotliOutput } from './brotli.streams.js'
|
||||
import { kNumTransforms, transformDictionaryWord } from './brotli.transform.js'
|
||||
|
||||
const kNumLiteralCodes = 256
|
||||
|
||||
Loading…
Reference in New Issue
Block a user