From 76dc980e8a09b8d3726ff267af43d247172499a9 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Thu, 20 Mar 2025 00:04:21 -0700 Subject: [PATCH] Update README --- README.md | 35 ++++++++++++--------- src/brotli.bitreader.js | 2 +- src/{brotliContext.js => brotli.context.js} | 0 src/brotli.js | 4 +-- src/{brotliStreams.js => brotli.streams.js} | 0 5 files changed, 24 insertions(+), 17 deletions(-) rename src/{brotliContext.js => brotli.context.js} (100%) rename src/{brotliStreams.js => brotli.streams.js} (100%) diff --git a/README.md b/README.md index bd16e7a..a8f64ce 100644 --- a/README.md +++ b/README.md @@ -8,26 +8,33 @@ [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) ![coverage](https://img.shields.io/badge/Coverage-86-darkred) -This package exports a `compressors` object intended to be passed into [hyparquet](https://github.com/hyparam/hyparquet). +This package provides decompressors for various compression codecs. +It is designed to be used with [hyparquet](https://github.com/hyparam/hyparquet) in order to provide full support for all parquet compression formats. + +## Introduction [Apache Parquet](https://parquet.apache.org) is a popular columnar storage format that is widely used in data engineering, data science, and machine learning applications for efficiently storing and processing large datasets. It supports a number of different compression formats, but most parquet files use snappy compression. -The hyparquet library by default only supports `uncompressed` and `snappy` compressed files. The `hyparquet-compressors` package extends support for all legal parquet compression formats. +[Hyparquet](https://github.com/hyparam/hyparquet) is a fast and lightweight parquet reader that is designed to work in both node.js and the browser. -The `hyparquet-compressors` package works in both node.js and the browser. Uses js and wasm packages, no system dependencies. 
+By default, hyparquet only supports `uncompressed` and `snappy` compressed files (the most common parquet compression codecs). The `hyparquet-compressors` package extends support for all legal parquet compression formats. -## Usage +`hyparquet-compressors` works in both node.js and the browser. It uses js and wasm packages, with no system dependencies. + +## Hyparquet + +To use `hyparquet-compressors` with `hyparquet`, simply pass the `compressors` object to the `parquetReadObjects` function. ```js -import { parquetRead } from 'hyparquet' +import { parquetReadObjects } from 'hyparquet' import { compressors } from 'hyparquet-compressors' -await parquetRead({ file, compressors, onComplete: console.log }) +const data = await parquetReadObjects({ file, compressors }) ``` See [hyparquet](https://github.com/hyparam/hyparquet) repo for more info. -# Compression formats +## Compression formats Parquet compression types supported with `hyparquet-compressors`: - [X] Uncompressed @@ -39,35 +46,35 @@ Parquet compression types supported with `hyparquet-compressors`: - [X] ZSTD - [X] LZ4_RAW -## Snappy +### Snappy Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using minimal wasm. -## Gzip +### Gzip New gzip implementation adapted from [fflate](https://github.com/101arrowz/fflate). Includes modifications to handle repeated back-to-back gzip streams that sometimes occur in parquet files (but was not supported by fflate). -## Brotli +### Brotli Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js) which pre-compresses the brotli dictionary using gzip to minimize the distribution bundle size. -## LZ4 +### LZ4 New LZ4 implementation includes support for legacy hadoop LZ4 frame format used on some old parquet files. -## Zstd +### Zstd Uses [fzstd](https://github.com/101arrowz/fzstd) for Zstandard decompression. 
-# Bundle size +## Bundle size | File | Size | | --- | --- | | hyparquet-compressors.min.js | 116.1kb | | hyparquet-compressors.min.js.gz | 75.2kb | -# References +## References - https://parquet.apache.org/docs/file-format/data-pages/compression/ - https://en.wikipedia.org/wiki/Brotli diff --git a/src/brotli.bitreader.js b/src/brotli.bitreader.js index b36346c..1049aa8 100644 --- a/src/brotli.bitreader.js +++ b/src/brotli.bitreader.js @@ -28,7 +28,7 @@ const kBitMask = new Uint32Array([ * Input byte buffer, consist of a ringbuffer and a "slack" region where * bytes from the start of the ringbuffer are copied. * - * @typedef {import('./brotliStreams.js').BrotliInput} BrotliInput + * @typedef {import('./brotli.streams.js').BrotliInput} BrotliInput * @param {BrotliInput} input */ function BrotliBitReader(input) { diff --git a/src/brotliContext.js b/src/brotli.context.js similarity index 100% rename from src/brotliContext.js rename to src/brotli.context.js diff --git a/src/brotli.js b/src/brotli.js index 4f547e7..bb06176 100644 --- a/src/brotli.js +++ b/src/brotli.js @@ -4,10 +4,10 @@ */ import BrotliBitReader from './brotli.bitreader.js' -import { lookup, lookupOffsets } from './brotliContext.js' +import { lookup, lookupOffsets } from './brotli.context.js' import { HuffmanCode, readHuffmanCode, readSymbol } from './brotli.huffman.js' import { kBlockLengthPrefixCode, kCopyLengthPrefixCode, kCopyRangeLut, kInsertLengthPrefixCode, kInsertRangeLut } from './brotli.prefix.js' -import { BrotliInput, BrotliOutput } from './brotliStreams.js' +import { BrotliInput, BrotliOutput } from './brotli.streams.js' import { kNumTransforms, transformDictionaryWord } from './brotli.transform.js' const kNumLiteralCodes = 256 diff --git a/src/brotliStreams.js b/src/brotli.streams.js similarity index 100% rename from src/brotliStreams.js rename to src/brotli.streams.js