hyparquet-compressors/src/lz4.js

/**
 * LZ4 decompression with legacy hadoop support.
 * https://github.com/apache/arrow/blob/apache-arrow-16.1.0/cpp/src/arrow/util/compression_lz4.cc#L475
 *
 * @param {Uint8Array} input
 * @param {number} outputLength
 * @returns {Uint8Array}
 */
export function decompressLz4(input, outputLength) {
  const output = new Uint8Array(outputLength)
  try {
    let i = 0 // input index
    let o = 0 // output index
    while (i < input.length - 8) {
      const expectedOutputLength = input[i++] << 24 | input[i++] << 16 | input[i++] << 8 | input[i++]
      const expectedInputLength = input[i++] << 24 | input[i++] << 16 | input[i++] << 8 | input[i++]
      if (input.length - i < expectedInputLength) throw new Error('lz4 not hadoop')
      if (output.length < expectedOutputLength) throw new Error('lz4 not hadoop')

      // decompress and compare with expected
      const chunk = lz4basic(input.subarray(i, i + expectedInputLength), output, o)
      if (chunk !== expectedOutputLength) throw new Error('lz4 not hadoop')
      i += expectedInputLength
      o += expectedOutputLength

      if (i === input.length) return output
    }
    if (i < input.length) throw new Error('lz4 not hadoop')
  } catch (error) {
    if (error instanceof Error && error.message !== 'lz4 not hadoop') throw error
    // fallback to basic lz4
    lz4basic(input, output, 0)
  }
  return output
}

/**
 * Basic LZ4 block decompression.
 *
 * @param {Uint8Array} input
 * @param {number} outputLength
 * @returns {Uint8Array}
 */
export function decompressLz4Raw(input, outputLength) {
  const output = new Uint8Array(outputLength)
  lz4basic(input, output, 0)
  return output
}

/**
 * @param {Uint8Array} input
 * @param {Uint8Array} output
 * @param {number} outputIndex
 * @returns {number} bytes written
 */
function lz4basic(input, output, outputIndex) {
  let len = outputIndex // output position
  for (let i = 0; i < input.length;) {
    const token = input[i++]

    let literals = token >> 4
    if (literals) {
      // literal length
      let byte = literals + 240
      while (byte === 255) literals += byte = input[i++]
      // copy literals
      output.set(input.subarray(i, i + literals), len)
      len += literals
      i += literals
      if (i >= input.length) return len - outputIndex
    }

    const offset = input[i++] | input[i++] << 8
    if (!offset || offset > len) {
      throw new Error(`lz4 offset out of range ${offset}`)
    }
    // match length
    let matchLength = (token & 0xf) + 4 // minmatch 4
    let byte = matchLength + 240
    while (byte === 255) matchLength += byte = input[i++]
    // copy match
    // TODO: fast path when no overlap
    let pos = len - offset
    const end = len + matchLength
    while (len < end) output[len++] = output[pos++]
  }
  return len - outputIndex
}
Native lz4 2024-05-20 00:14:16 +00:00			`/**`
Fix hadoop lz4 2024-05-20 04:09:36 +00:00			`* LZ4 decompression with legacy hadoop support.`
			`* https://github.com/apache/arrow/blob/apache-arrow-16.1.0/cpp/src/arrow/util/compression_lz4.cc#L475`
Native lz4 2024-05-20 00:14:16 +00:00			`*`
			`* @param {Uint8Array} input`
			`* @param {number} outputLength`
			`* @returns {Uint8Array}`
			`*/`
Split out exports for more efficient packaging 2025-03-20 06:02:32 +00:00			`export function decompressLz4(input, outputLength) {`
Native lz4 2024-05-20 00:14:16 +00:00			`const output = new Uint8Array(outputLength)`
Fix hadoop lz4 2024-05-20 04:09:36 +00:00			`try {`
			`let i = 0 // input index`
			`let o = 0 // output index`
			`while (i < input.length - 8) {`
			`const expectedOutputLength = input[i++] << 24 \| input[i++] << 16 \| input[i++] << 8 \| input[i++]`
			`const expectedInputLength = input[i++] << 24 \| input[i++] << 16 \| input[i++] << 8 \| input[i++]`
			`if (input.length - i < expectedInputLength) throw new Error('lz4 not hadoop')`
			`if (output.length < expectedOutputLength) throw new Error('lz4 not hadoop')`

			`// decompress and compare with expected`
			`const chunk = lz4basic(input.subarray(i, i + expectedInputLength), output, o)`
			`if (chunk !== expectedOutputLength) throw new Error('lz4 not hadoop')`
			`i += expectedInputLength`
			`o += expectedOutputLength`

			`if (i === input.length) return output`
			`}`
			`if (i < input.length) throw new Error('lz4 not hadoop')`
			`} catch (error) {`
			`if (error instanceof Error && error.message !== 'lz4 not hadoop') throw error`
			`// fallback to basic lz4`
			`lz4basic(input, output, 0)`
			`}`
			`return output`
			`}`

			`/**`
			`* Basic LZ4 block decompression.`
			`*`
			`* @param {Uint8Array} input`
			`* @param {number} outputLength`
			`* @returns {Uint8Array}`
			`*/`
Split out exports for more efficient packaging 2025-03-20 06:02:32 +00:00			`export function decompressLz4Raw(input, outputLength) {`
Fix hadoop lz4 2024-05-20 04:09:36 +00:00			`const output = new Uint8Array(outputLength)`
			`lz4basic(input, output, 0)`
			`return output`
			`}`

			`/**`
			`* @param {Uint8Array} input`
			`* @param {Uint8Array} output`
			`* @param {number} outputIndex`
			`* @returns {number} bytes written`
			`*/`
			`function lz4basic(input, output, outputIndex) {`
			`let len = outputIndex // output position`
Native lz4 2024-05-20 00:14:16 +00:00			`for (let i = 0; i < input.length;) {`
			`const token = input[i++]`

			`let literals = token >> 4`
			`if (literals) {`
			`// literal length`
			`let byte = literals + 240`
			`while (byte === 255) literals += byte = input[i++]`
			`// copy literals`
			`output.set(input.subarray(i, i + literals), len)`
			`len += literals`
			`i += literals`
Fix hadoop lz4 2024-05-20 04:09:36 +00:00			`if (i >= input.length) return len - outputIndex`
Native lz4 2024-05-20 00:14:16 +00:00			`}`

			`const offset = input[i++] \| input[i++] << 8`
Fix hadoop lz4 2024-05-20 04:09:36 +00:00			`if (!offset \|\| offset > len) {`
			throw new Error(`lz4 offset out of range ${offset}`)
			`}`
Native lz4 2024-05-20 00:14:16 +00:00			`// match length`
Fix hadoop lz4 2024-05-20 04:09:36 +00:00			`let matchLength = (token & 0xf) + 4 // minmatch 4`
Native lz4 2024-05-20 00:14:16 +00:00			`let byte = matchLength + 240`
			`while (byte === 255) matchLength += byte = input[i++]`
			`// copy match`
Fix hadoop lz4 2024-05-20 04:09:36 +00:00			`// TODO: fast path when no overlap`
Native lz4 2024-05-20 00:14:16 +00:00			`let pos = len - offset`
			`const end = len + matchLength`
			`while (len < end) output[len++] = output[pos++]`
			`}`
Fix hadoop lz4 2024-05-20 04:09:36 +00:00			`return len - outputIndex`
Native lz4 2024-05-20 00:14:16 +00:00			`}`