From e2dae829b24ad21b82135b3805d433950ab7ebd0 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Sat, 25 May 2024 17:52:32 -0700 Subject: [PATCH] Update README and dependencies --- README.md | 32 +++++++++++++++++++++++++++++-- package.json | 6 +++--- rollup.config.js | 2 +- test/files/brotli_compressed.json | 8 ++++---- test/files/lz4_compressed.json | 8 ++++---- 5 files changed, 42 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 921898d..e069ddd 100644 --- a/README.md +++ b/README.md @@ -24,18 +24,46 @@ await parquetRead({ file, compressors, onComplete: console.log }) See [hyparquet](https://github.com/hyparam/hyparquet) repo for further info. -# Supported compression formats +# Compression formats Parquet compression types supported with `hyparquet-compressors`: - [X] Uncompressed - [X] Snappy - - [x] GZip + - [x] Gzip - [ ] LZO - [X] Brotli - [X] LZ4 - [X] ZSTD - [X] LZ4_RAW +## Snappy + +Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using minimal wasm. + +## Gzip + +New gzip implementation adapted from [fflate](https://github.com/101arrowz/fflate). +Includes modifications to handle repeated back-to-back gzip streams, which sometimes occur in parquet files but are not supported by fflate. + +## Brotli + +Uses [brotli.js](https://github.com/foliojs/brotli.js) for brotli decompression. + +## LZ4 + +New LZ4 implementation includes support for the legacy Hadoop LZ4 frame format used in some old parquet files. + +## Zstd + +Uses [fzstd](https://github.com/101arrowz/fzstd) for Zstandard decompression. 
+ +# Bundle size + +| File | Size | +| - | - | +| hyparquet-compressors.min.js | 502.1kb | +| hyparquet-compressors.min.js.gz | 102.2kb | + # References - https://parquet.apache.org/docs/file-format/data-pages/compression/ diff --git a/package.json b/package.json index 14b4b0d..d9a3c0d 100644 --- a/package.json +++ b/package.json @@ -37,7 +37,7 @@ "hysnappy": "0.3.1" }, "devDependencies": { - "@babel/eslint-parser": "7.24.5", + "@babel/eslint-parser": "7.24.6", "@rollup/plugin-commonjs": "25.0.8", "@rollup/plugin-node-resolve": "15.2.3", "@rollup/plugin-terser": "0.4.4", @@ -46,8 +46,8 @@ "@vitest/coverage-v8": "1.6.0", "eslint": "8.57.0", "eslint-plugin-import": "2.29.1", - "eslint-plugin-jsdoc": "48.2.5", - "hyparquet": "0.9.5", + "eslint-plugin-jsdoc": "48.2.6", + "hyparquet": "0.9.6", "rollup": "4.18.0", "typescript": "5.4.5", "vitest": "1.6.0" diff --git a/rollup.config.js b/rollup.config.js index 0c91b55..22a619d 100644 --- a/rollup.config.js +++ b/rollup.config.js @@ -5,7 +5,7 @@ import terser from '@rollup/plugin-terser' export default { input: 'src/index.js', output: { - file: 'dist/bundle.min.js', + file: 'dist/hyparquet-compressors.min.js', format: 'esm', sourcemap: true, }, diff --git a/test/files/brotli_compressed.json b/test/files/brotli_compressed.json index 9956d35..b3b0100 100644 --- a/test/files/brotli_compressed.json +++ b/test/files/brotli_compressed.json @@ -1,22 +1,22 @@ [ [ 1593604800, - [97, 98, 99], + "abc", 42 ], [ 1593604800, - [100, 101, 102], + "def", 7.7 ], [ 1593604801, - [97, 98, 99], + "abc", 42.125 ], [ 1593604801, - [100, 101, 102], + "def", 7.7 ] ] diff --git a/test/files/lz4_compressed.json b/test/files/lz4_compressed.json index 9956d35..b3b0100 100644 --- a/test/files/lz4_compressed.json +++ b/test/files/lz4_compressed.json @@ -1,22 +1,22 @@ [ [ 1593604800, - [97, 98, 99], + "abc", 42 ], [ 1593604800, - [100, 101, 102], + "def", 7.7 ], [ 1593604801, - [97, 98, 99], + "abc", 42.125 ], [ 1593604801, - [100, 101, 102], + "def", 
7.7 ] ]