mirror of
https://github.com/asadbek064/hyparquet-compressors.git
synced 2026-01-11 21:26:38 +00:00
Resizable gzip output buffer
This commit is contained in:
parent
76dc980e8a
commit
800223441c
13
README.md
13
README.md
@ -48,16 +48,23 @@ Parquet compression types supported with `hyparquet-compressors`:
|
||||
|
||||
### Snappy
|
||||
|
||||
Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using minimal wasm.
|
||||
Snappy compression uses [hysnappy](https://github.com/hyparam/hysnappy) for fast snappy decompression using a minimal [WASM](https://en.wikipedia.org/wiki/WebAssembly) module.
|
||||
|
||||
We load the wasm module _synchronously_ from base64 in the js file. This avoids a network request, and greatly simplifies bundling and serving wasm.
|
||||
|
||||
### Gzip
|
||||
|
||||
New gzip implementation adapted from [fflate](https://github.com/101arrowz/fflate).
|
||||
Includes modifications to handle repeated back-to-back gzip streams that sometimes occur in parquet files (but was not supported by fflate).
|
||||
Includes modifications to handle repeated back-to-back gzip streams that sometimes occur in parquet files (but are not supported by fflate).
|
||||
|
||||
For gzip, the `output` buffer argument is optional:
|
||||
- If `output` is defined, the decompressor will write to `output` until it is full.
|
||||
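- If `output` is undefined, the decompressor will allocate a new buffer, and expand it as needed to fit the uncompressed gzip data. Importantly, the caller should use the _returned_ buffer, because the buffer may be reallocated as it grows and the returned array is trimmed to the length of the decompressed data.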
|
||||
|
||||
### Brotli
|
||||
|
||||
Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js) which pre-compresses the brotli dictionary using gzip to minimize the distribution bundle size.
|
||||
Includes a minimal port of [brotli.js](https://github.com/foliojs/brotli.js).
|
||||
Our implementation uses gzip to pre-compress the brotli dictionary, in order to minimize the bundle size.
|
||||
|
||||
### LZ4
|
||||
|
||||
|
||||
27
src/gzip.js
27
src/gzip.js
@ -68,14 +68,16 @@ function gzipStart(input, i) {
|
||||
/**
|
||||
* GZip decompression
|
||||
* @param {Uint8Array} input
|
||||
* @param {Uint8Array} out
|
||||
* @param {Uint8Array} [output]
|
||||
* @param {number} [inputIndex]
|
||||
* @param {number} [outputIndex]
|
||||
* @returns {Uint8Array}
|
||||
*/
|
||||
export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
|
||||
if (!(input.length - inputIndex)) return
|
||||
export function gunzip(input, output, inputIndex = 0, outputIndex = 0) {
|
||||
let out = output ?? new Uint8Array(1024) // initial size
|
||||
if (!(input.length - inputIndex)) return out
|
||||
const payloadStart = gzipStart(input, inputIndex)
|
||||
if (payloadStart === input.length - 8) return
|
||||
if (payloadStart === input.length - 8) return out
|
||||
if (payloadStart > input.length - 8) throw new Error('unexpected EOF')
|
||||
let pos = payloadStart * 8 // position in bits
|
||||
let final = 0 // last chunk?
|
||||
@ -84,6 +86,16 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
|
||||
let lengthMap
|
||||
let distMap
|
||||
const totalBits = input.length * 8
|
||||
|
||||
/** @param {number} length */
|
||||
function ensureSize(length) {
|
||||
if (!output && length > out.length) {
|
||||
const old = out
|
||||
out = new Uint8Array(Math.max(old.length * 2, length))
|
||||
out.set(old)
|
||||
}
|
||||
}
|
||||
|
||||
do {
|
||||
if (!lengthMap) {
|
||||
// final chunk is next?
|
||||
@ -98,6 +110,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
|
||||
const t = s + l
|
||||
if (t > input.length) throw new Error('unexpected EOF')
|
||||
// copy uncompressed data
|
||||
ensureSize(outputIndex + l)
|
||||
out.set(input.subarray(s, t), outputIndex)
|
||||
outputIndex += l
|
||||
pos = t * 8
|
||||
@ -160,6 +173,8 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
|
||||
} else throw new Error('invalid block type')
|
||||
if (pos > totalBits) throw new Error('unexpected EOF')
|
||||
}
|
||||
|
||||
ensureSize(outputIndex + 131072) // max chunk size?
|
||||
const lms = (1 << lengthBits) - 1
|
||||
const dms = (1 << distBits) - 1
|
||||
let lpos = pos
|
||||
@ -199,6 +214,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
|
||||
if (pos > totalBits) throw new Error('unexpected EOF')
|
||||
const end = outputIndex + add
|
||||
if (outputIndex < dt) throw new Error('unexpected dictionary case')
|
||||
ensureSize(end)
|
||||
for (; outputIndex < end; outputIndex++) out[outputIndex] = out[outputIndex - dt]
|
||||
}
|
||||
}
|
||||
@ -211,4 +227,7 @@ export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
|
||||
const nextBlock = Math.ceil(pos / 8) + 8 // 8 byte gzip footer
|
||||
gunzip(input, out, nextBlock, outputIndex)
|
||||
}
|
||||
|
||||
if (!output) return out.subarray(0, outputIndex)
|
||||
return out
|
||||
}
|
||||
|
||||
@ -54,4 +54,10 @@ describe('gzip compressor', () => {
|
||||
expect(toJson(data)).toEqual(JSON.parse(expected))
|
||||
} })
|
||||
})
|
||||
|
||||
it('read gzip with unknown length', () => {
|
||||
const input = new Uint8Array([31, 139, 8, 0, 77, 204, 77, 102, 0, 3, 227, 230, 22, 83, 4, 0, 117, 18, 225, 170, 4, 0, 0, 0])
|
||||
const resized = gunzip(input)
|
||||
expect(resized).toEqual(new Uint8Array([11, 11, 22, 33]))
|
||||
})
|
||||
})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user