Resizable gzip output buffer

This commit is contained in:
Kenny Daniel 2025-03-20 00:16:08 -07:00
parent 76dc980e8a
commit 800223441c
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
3 changed files with 39 additions and 7 deletions

@ -48,16 +48,23 @@ Parquet compression types supported with `hyparquet-compressors`:
### Snappy
Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using minimal wasm.
Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using a minimal [WASM](https://en.wikipedia.org/wiki/WebAssembly) module.
We load the wasm module _synchronously_ from a base64 string embedded in the js file. This avoids a network request, and greatly simplifies bundling and serving wasm.
### Gzip
New gzip implementation adapted from [fflate](https://github.com/101arrowz/fflate).
Includes modifications to handle repeated back-to-back gzip streams that sometimes occur in parquet files (but was not supported by fflate).
Includes modifications to handle repeated back-to-back gzip streams that sometimes occur in parquet files (but are not supported by fflate).
For gzip, the `output` buffer argument is optional:
- If `output` is defined, the decompressor will write to `output` until it is full.
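- If `output` is undefined, the decompressor will allocate a new buffer, and expand it as needed to fit the uncompressed gzip data. Importantly, the caller must use the _returned_ buffer: it is the only reference to the (possibly reallocated) data, and it is trimmed to the decompressed length.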
### Brotli
Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js) which pre-compresses the brotli dictionary using gzip to minimize the distribution bundle size.
Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js).
Our implementation uses gzip to pre-compress the brotli dictionary, in order to minimize the bundle size.
### LZ4

@ -68,14 +68,16 @@ function gzipStart(input, i) {
/**
* GZip decompression
* @param {Uint8Array} input
* @param {Uint8Array} out
* @param {Uint8Array} [output]
* @param {number} [inputIndex]
* @param {number} [outputIndex]
* @returns {Uint8Array}
*/
export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
if (!(input.length - inputIndex)) return
export function gunzip(input, output, inputIndex = 0, outputIndex = 0) {
let out = output ?? new Uint8Array(1024) // initial size
if (!(input.length - inputIndex)) return out
const payloadStart = gzipStart(input, inputIndex)
if (payloadStart === input.length - 8) return
if (payloadStart === input.length - 8) return out
if (payloadStart > input.length - 8) throw new Error('unexpected EOF')
let pos = payloadStart * 8 // position in bits
let final = 0 // last chunk?
@ -84,6 +86,16 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
let lengthMap
let distMap
const totalBits = input.length * 8
/**
 * Grow `out` so that it can hold at least `length` bytes.
 * A caller-supplied `output` buffer is never replaced.
 * @param {number} length required capacity in bytes
 */
function ensureSize(length) {
  const mustGrow = !output && length > out.length
  if (!mustGrow) return
  // at least double the capacity; bytes already written are carried over
  const grown = new Uint8Array(Math.max(out.length * 2, length))
  grown.set(out)
  out = grown
}
do {
if (!lengthMap) {
// final chunk is next?
@ -98,6 +110,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
const t = s + l
if (t > input.length) throw new Error('unexpected EOF')
// copy uncompressed data
ensureSize(outputIndex + l)
out.set(input.subarray(s, t), outputIndex)
outputIndex += l
pos = t * 8
@ -160,6 +173,8 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
} else throw new Error('invalid block type')
if (pos > totalBits) throw new Error('unexpected EOF')
}
ensureSize(outputIndex + 131072) // max chunk size?
const lms = (1 << lengthBits) - 1
const dms = (1 << distBits) - 1
let lpos = pos
@ -199,6 +214,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
if (pos > totalBits) throw new Error('unexpected EOF')
const end = outputIndex + add
if (outputIndex < dt) throw new Error('unexpected dictionary case')
ensureSize(end)
for (; outputIndex < end; outputIndex++) out[outputIndex] = out[outputIndex - dt]
}
}
@ -211,4 +227,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
const nextBlock = Math.ceil(pos / 8) + 8 // 8 byte gzip footer
gunzip(input, out, nextBlock, outputIndex)
}
if (!output) return out.subarray(0, outputIndex)
return out
}

@ -54,4 +54,10 @@ describe('gzip compressor', () => {
expect(toJson(data)).toEqual(JSON.parse(expected))
} })
})
// Calling gunzip without an output buffer should return exactly the decompressed bytes.
it('read gzip with unknown length', () => {
  const compressed = new Uint8Array([
    31, 139, 8, 0, 77, 204, 77, 102, 0, 3, // 10-byte gzip header
    227, 230, 22, 83, 4, 0, // deflate payload
    117, 18, 225, 170, 4, 0, 0, 0, // 8-byte gzip footer
  ])
  const expectedBytes = new Uint8Array([11, 11, 22, 33])
  expect(gunzip(compressed)).toEqual(expectedBytes)
})
})