hyparquet-compressors/src/lz4.js

89 lines
2.8 KiB
JavaScript
Raw Normal View History

2024-05-20 00:14:16 +00:00
/**
 * LZ4 decompression with legacy hadoop support.
 * https://github.com/apache/arrow/blob/apache-arrow-16.1.0/cpp/src/arrow/util/compression_lz4.cc#L475
 *
 * The input is first probed as a sequence of hadoop frames. Each frame is a
 * big-endian uint32 decompressed size, a big-endian uint32 compressed size,
 * and then that many bytes of raw lz4 block data. If the probe fails for any
 * reason, the whole input is decoded as one raw lz4 block instead.
 *
 * @param {Uint8Array} input compressed bytes
 * @param {number} outputLength number of bytes to allocate for the result
 * @returns {Uint8Array} newly allocated buffer of outputLength bytes
 * @throws {Error} if the raw lz4 fallback cannot decode the input either
 */
export function decompressLz4(input, outputLength) {
  const output = new Uint8Array(outputLength)
  let isHadoop = false
  try {
    isHadoop = decodeHadoopFrames(input, output)
  } catch {
    // A frame payload that fails to decode only means the input is not
    // hadoop framed. It must not abort before the raw fallback is tried.
  }
  // fallback to basic lz4, overwriting anything the probe wrote
  if (!isHadoop) lz4basic(input, output, 0)
  return output
}

/**
 * Try to decode input as back-to-back hadoop lz4 frames into output.
 *
 * @param {Uint8Array} input
 * @param {Uint8Array} output
 * @returns {boolean} true if every byte of input was consumed by valid frames
 */
function decodeHadoopFrames(input, output) {
  let i = 0 // input index
  let o = 0 // output index
  while (i < input.length - 8) {
    // `>>> 0` keeps the lengths unsigned: a set high bit must read as a huge
    // length that fails the checks below, not as a negative number.
    const expectedOutputLength = (input[i++] << 24 | input[i++] << 16 | input[i++] << 8 | input[i++]) >>> 0
    const expectedInputLength = (input[i++] << 24 | input[i++] << 16 | input[i++] << 8 | input[i++]) >>> 0
    if (input.length - i < expectedInputLength) return false
    // compare against the space still free, not the total output size
    if (output.length - o < expectedOutputLength) return false
    // decompress and compare with expected
    const chunk = lz4basic(input.subarray(i, i + expectedInputLength), output, o)
    if (chunk !== expectedOutputLength) return false
    i += expectedInputLength
    o += expectedOutputLength
  }
  // leftover bytes too short for another frame header mean it is not hadoop
  return i === input.length
}
/**
 * Decompress a single raw LZ4 block that has no framing around it.
 *
 * @param {Uint8Array} input compressed block bytes
 * @param {number} outputLength number of bytes to allocate for the result
 * @returns {Uint8Array} newly allocated buffer of outputLength bytes
 */
export function decompressLz4Raw(input, outputLength) {
  const decoded = new Uint8Array(outputLength)
  lz4basic(input, decoded, 0)
  return decoded
}
/**
 * Decode one raw LZ4 block from input into output, starting at outputIndex.
 *
 * A block is a series of sequences. Each sequence is a token byte, optional
 * literal-length extension bytes, the literal bytes, a 2-byte little-endian
 * match offset and optional match-length extension bytes. The final sequence
 * stops after its literals.
 *
 * Match offsets are checked against the absolute output position, so a match
 * may reach back into bytes that were already in output before outputIndex.
 *
 * @param {Uint8Array} input
 * @param {Uint8Array} output
 * @param {number} outputIndex
 * @returns {number} bytes written
 * @throws {Error} if input ends mid-sequence or a match offset is invalid
 * @throws {RangeError} if the decoded data does not fit in output
 */
function lz4basic(input, output, outputIndex) {
  let len = outputIndex // output position
  let i = 0 // input position
  while (i < input.length) {
    const token = input[i++]

    // literal length: high nibble, where 15 means extension bytes follow
    let literals = token >> 4
    if (literals === 15) {
      let byte
      do {
        if (i >= input.length) throw new Error('lz4 input truncated')
        byte = input[i++]
        literals += byte
      } while (byte === 255)
    }
    if (literals) {
      if (i + literals > input.length) throw new Error('lz4 input truncated')
      // copy literals (set throws RangeError if they do not fit in output)
      output.set(input.subarray(i, i + literals), len)
      len += literals
      i += literals
    }
    // The last sequence has no match part. This also covers a lone token
    // with zero literals, which is how an empty block is encoded.
    if (i >= input.length) break

    // match offset, 2 bytes little-endian
    if (i + 2 > input.length) throw new Error('lz4 input truncated')
    const offset = input[i++] | input[i++] << 8
    if (!offset || offset > len) {
      throw new Error(`lz4 offset out of range ${offset}`)
    }

    // match length: low nibble plus minmatch 4. The extension rule applies to
    // the raw nibble, not to the value after minmatch has been added.
    const matchNibble = token & 0xf
    let matchLength = matchNibble + 4
    if (matchNibble === 15) {
      let byte
      do {
        if (i >= input.length) throw new Error('lz4 input truncated')
        byte = input[i++]
        matchLength += byte
      } while (byte === 255)
    }

    // copy match
    const end = len + matchLength
    if (end > output.length) {
      // typed arrays drop out-of-bounds writes silently, so check explicitly
      throw new RangeError('lz4 match exceeds output length')
    }
    // TODO: fast path when no overlap
    // byte-by-byte on purpose: source and destination may overlap
    let pos = len - offset
    while (len < end) output[len++] = output[pos++]
  }
  return len - outputIndex
}