diff --git a/README.md b/README.md index a8f64ce..d0eef56 100644 --- a/README.md +++ b/README.md @@ -48,16 +48,23 @@ Parquet compression types supported with `hyparquet-compressors`: ### Snappy -Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using minimal wasm. +Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using a minimal [WASM](https://en.wikipedia.org/wiki/WebAssembly) module. + +We load the wasm module _synchronously_ from base64 in the js file. This avoids a network request, and greatly simplifies bundling and serving wasm. ### Gzip New gzip implementation adapted from [fflate](https://github.com/101arrowz/fflate). -Includes modifications to handle repeated back-to-back gzip streams that sometimes occur in parquet files (but was not supported by fflate). +Includes modifications to handle repeated back-to-back gzip streams that sometimes occur in parquet files (but are not supported by fflate). + +For gzip, the `output` buffer argument is optional: + - If `output` is defined, the decompressor will write to `output` until it is full. + - If `output` is undefined, the decompressor will allocate a new buffer, and expand it as needed to fit the uncompressed gzip data. Importantly, the caller should use the _returned_ buffer. ### Brotli -Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js) which pre-compresses the brotli dictionary using gzip to minimize the distribution bundle size. +Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js). +Our implementation uses gzip to pre-compress the brotli dictionary, in order to minimize the bundle size. ### LZ4 diff --git a/src/gzip.js b/src/gzip.js index f0f52ff..d881837 100644 --- a/src/gzip.js +++ b/src/gzip.js @@ -68,14 +68,16 @@ function gzipStart(input, i) { /** * GZip decompression * @param {Uint8Array} input - * @param {Uint8Array} out + * @param {Uint8Array} [output] * @param {number} [inputIndex] * @param {number} [outputIndex] + * @returns {Uint8Array} */ -export function gunzip(input, out, inputIndex = 0, outputIndex = 0) { - if (!(input.length - inputIndex)) return +export function gunzip(input, output, inputIndex = 0, outputIndex = 0) { + let out = output ?? new Uint8Array(1024) // initial size + if (!(input.length - inputIndex)) return out const payloadStart = gzipStart(input, inputIndex) - if (payloadStart === input.length - 8) return + if (payloadStart === input.length - 8) return out if (payloadStart > input.length - 8) throw new Error('unexpected EOF') let pos = payloadStart * 8 // position in bits let final = 0 // last chunk? @@ -84,6 +86,16 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) { let lengthMap let distMap const totalBits = input.length * 8 + + /** @param {number} length */ + function ensureSize(length) { + if (!output && length > out.length) { + const old = out + out = new Uint8Array(Math.max(old.length * 2, length)) + out.set(old) + } + } + do { if (!lengthMap) { // final chunk is next? @@ -98,6 +110,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) { const t = s + l if (t > input.length) throw new Error('unexpected EOF') // copy uncompressed data + ensureSize(outputIndex + l) out.set(input.subarray(s, t), outputIndex) outputIndex += l pos = t * 8 @@ -160,6 +173,8 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) { } else throw new Error('invalid block type') if (pos > totalBits) throw new Error('unexpected EOF') } + + ensureSize(outputIndex + 131072) // max chunk size? const lms = (1 << lengthBits) - 1 const dms = (1 << distBits) - 1 let lpos = pos @@ -199,6 +214,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) { if (pos > totalBits) throw new Error('unexpected EOF') const end = outputIndex + add if (outputIndex < dt) throw new Error('unexpected dictionary case') + ensureSize(end) for (; outputIndex < end; outputIndex++) out[outputIndex] = out[outputIndex - dt] } } @@ -211,4 +227,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) { const nextBlock = Math.ceil(pos / 8) + 8 // 8 byte gzip footer gunzip(input, out, nextBlock, outputIndex) } + + if (!output) return out.subarray(0, outputIndex) + return out } diff --git a/test/gzip.test.js b/test/gzip.test.js index 6b9b1bc..5e46ce6 100644 --- a/test/gzip.test.js +++ b/test/gzip.test.js @@ -54,4 +54,10 @@ describe('gzip compressor', () => { expect(toJson(data)).toEqual(JSON.parse(expected)) } }) }) + + it('read gzip with unknown length', () => { + const input = new Uint8Array([31, 139, 8, 0, 77, 204, 77, 102, 0, 3, 227, 230, 22, 83, 4, 0, 117, 18, 225, 170, 4, 0, 0, 0]) + const resized = gunzip(input) + expect(resized).toEqual(new Uint8Array([11, 11, 22, 33])) + }) })