From 97359f877b1dd9f6df68af8dda204e13f9bfca74 Mon Sep 17 00:00:00 2001
From: Kenny Daniel
Date: Wed, 22 May 2024 03:30:22 -0700
Subject: [PATCH] Gzip implementation

---
 .eslintrc.json    |   3 +
 README.md         |   4 +-
 package.json      |  10 +-
 src/gzip.js       | 314 ++++++++++++++++++++++++++++++++++++++++++++++
 src/index.js      |   8 +-
 test/gzip.test.js |  42 ++++++-
 6 files changed, 370 insertions(+), 11 deletions(-)
 create mode 100644 src/gzip.js

diff --git a/.eslintrc.json b/.eslintrc.json
index f276322..f89c4b4 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -46,6 +46,9 @@
     "no-extra-parens": "error",
     "no-multi-spaces": "error",
     "no-trailing-spaces": "error",
+    "no-useless-concat": "error",
+    "no-useless-rename": "error",
+    "no-useless-return": "error",
     "no-var": "error",
     "object-curly-spacing": ["error", "always"],
     "prefer-const": "error",
diff --git a/README.md b/README.md
index 060c4a0..921898d 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
 [![npm](https://img.shields.io/npm/v/hyparquet-compressors)](https://www.npmjs.com/package/hyparquet-compressors)
 [![workflow status](https://github.com/hyparam/hyparquet-compressors/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-compressors/actions)
 [![mit license](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-![coverage](https://img.shields.io/badge/Coverage-97-darkred)
+![coverage](https://img.shields.io/badge/Coverage-93-darkred)
 
 This package exports a `compressors` object intended to be passed into [hyparquet](https://github.com/hyparam/hyparquet).
 
@@ -44,7 +44,7 @@ Parquet compression types supported with `hyparquet-compressors`:
  - https://en.wikipedia.org/wiki/LZ4_(compression_algorithm)
  - https://en.wikipedia.org/wiki/Snappy_(compression)
  - https://en.wikipedia.org/wiki/Zstd
+ - https://github.com/101arrowz/fflate
  - https://github.com/101arrowz/fzstd
  - https://github.com/foliojs/brotli.js
  - https://github.com/hyparam/hysnappy
- - https://github.com/nodeca/pako
diff --git a/package.json b/package.json
index 5ea8a6e..09e9399 100644
--- a/package.json
+++ b/package.json
@@ -33,23 +33,21 @@
   "dependencies": {
     "brotli": "1.3.3",
     "fzstd": "0.1.1",
-    "hysnappy": "0.3.1",
-    "pako": "2.1.0"
+    "hysnappy": "0.3.1"
   },
   "devDependencies": {
     "@babel/eslint-parser": "7.24.5",
-    "@rollup/plugin-commonjs": "25.0.7",
+    "@rollup/plugin-commonjs": "25.0.8",
     "@rollup/plugin-node-resolve": "15.2.3",
     "@rollup/plugin-terser": "0.4.4",
     "@types/brotli": "1.3.4",
     "@types/node": "20.12.12",
-    "@types/pako": "2.0.3",
     "@vitest/coverage-v8": "1.6.0",
     "eslint": "8.57.0",
     "eslint-plugin-import": "2.29.1",
     "eslint-plugin-jsdoc": "48.2.5",
-    "hyparquet": "0.9.4",
-    "rollup": "4.17.2",
+    "hyparquet": "0.9.5",
+    "rollup": "4.18.0",
     "typescript": "5.4.5",
     "vitest": "1.6.0"
   }
diff --git a/src/gzip.js b/src/gzip.js
new file mode 100644
index 0000000..decf75a
--- /dev/null
+++ b/src/gzip.js
@@ -0,0 +1,314 @@
+// Adapted from https://github.com/101arrowz/fflate Copyright (c) 2023 Arjun Barrett
+// https://tools.ietf.org/html/rfc1951
+
+// fixed length extra bits
+const fixedLengthExtraBits = new Uint8Array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, /* unused */ 0, 0, /* impossible */ 0])
+const fixedDistanceExtraBits = new Uint8Array([0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, /* unused */ 0, 0])
+const codeLengthIndexMap = new Uint8Array([16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15])
+
+/**
+ * get base, reverse index map from extra bits
+ * @param {Uint8Array} eb
+ * @param {number} start
+ * @returns {{base: Uint16Array, rev: Int32Array}}
+ */
+function freb(eb, start) {
+  const base = new Uint16Array(31)
+  for (let i = 0; i < 31; i++) {
+    base[i] = start += 1 << eb[i - 1]
+  }
+  // numbers here are max 18 bits
+  const rev = new Int32Array(base[30])
+  for (let i = 1; i < 30; i++) {
+    for (let j = base[i]; j < base[i + 1]; ++j) {
+      rev[j] = j - base[i] << 5 | i
+    }
+  }
+  return { base, rev }
+}
+
+const { base: fl, rev: revfl } = freb(fixedLengthExtraBits, 2)
+// we can ignore the fact that the other numbers are wrong; they never happen anyway
+fl[28] = 258
+revfl[258] = 28
+const { base: fd } = freb(fixedDistanceExtraBits, 0)
+
+// map of value to reverse (assuming 16 bits)
+const rev = new Uint16Array(32768)
+for (let i = 0; i < 32768; i++) {
+  // reverse table algorithm from SO
+  let x = (i & 0xAAAA) >> 1 | (i & 0x5555) << 1
+  x = (x & 0xCCCC) >> 2 | (x & 0x3333) << 2
+  x = (x & 0xF0F0) >> 4 | (x & 0x0F0F) << 4
+  rev[i] = ((x & 0xFF00) >> 8 | (x & 0x00FF) << 8) >> 1
+}
+
+/**
+ * create huffman tree from Uint8Array "map": index -> code length for code index
+ * maxBits must be at most 15
+ * @param {Uint8Array} cd
+ * @param {number} maxBits
+ * @param {0 | 1} r
+ * @returns {Uint16Array}
+ */
+function huffMap(cd, maxBits, r) {
+  // u16 "map": index -> # of codes with bit length = index
+  const l = new Uint16Array(maxBits)
+  // count how many codes use each bit length (zero lengths are unused codes)
+  for (let i = 0; i < cd.length; i++) {
+    if (cd[i]) ++l[cd[i] - 1]
+  }
+  // u16 "map": index -> minimum code for bit length = index
+  const le = new Uint16Array(maxBits)
+  for (let i = 1; i < maxBits; i++) {
+    le[i] = le[i - 1] + l[i - 1] << 1
+  }
+  let co
+  if (r) {
+    // u16 "map": index -> number of actual bits, symbol for code
+    co = new Uint16Array(1 << maxBits)
+    // bits to remove for reverser
+    const rvb = 15 - maxBits
+    for (let i = 0; i < cd.length; i++) {
+      // ignore 0 lengths
+      if (cd[i]) {
+        // num encoding both symbol and bits read
+        const sv = i << 4 | cd[i]
+        const freeBits = maxBits - cd[i]
+        let startValue = le[cd[i] - 1]++ << freeBits
+        for (const endValue = startValue | (1 << freeBits) - 1; startValue <= endValue; startValue++) {
+          // every 16 bit value starting with the code yields the same result
+          co[rev[startValue] >> rvb] = sv
+        }
+      }
+    }
+  } else {
+    co = new Uint16Array(cd.length)
+    for (let i = 0; i < cd.length; i++) {
+      if (cd[i]) {
+        co[i] = rev[le[cd[i] - 1]++] >> 15 - cd[i]
+      }
+    }
+  }
+  return co
+}
+
+// construct huffman trees
+const fixedLengthTree = new Uint8Array(288)
+for (let i = 0; i < 144; i++) fixedLengthTree[i] = 8
+for (let i = 144; i < 256; i++) fixedLengthTree[i] = 9
+for (let i = 256; i < 280; i++) fixedLengthTree[i] = 7
+for (let i = 280; i < 288; i++) fixedLengthTree[i] = 8
+const fixedDistanceTree = new Uint8Array(32)
+for (let i = 0; i < 32; i++) fixedDistanceTree[i] = 5
+const fixedLengthMap = /*#__PURE__*/ huffMap(fixedLengthTree, 9, 1)
+const fixedDistanceMap = /*#__PURE__*/ huffMap(fixedDistanceTree, 5, 1)
+
+/**
+ * find max of array
+ * @param {Uint8Array | number[]} a
+ * @returns {number}
+ */
+function max(a) {
+  let m = a[0]
+  for (let i = 1; i < a.length; i++) {
+    if (a[i] > m) m = a[i]
+  }
+  return m
+}
+
+/**
+ * read bits from input, starting at bit position pos, and mask the result with mask
+ * @param {Uint8Array} input
+ * @param {number} pos
+ * @param {number} mask
+ * @returns {number}
+ */
+function bits(input, pos, mask) {
+  const o = pos / 8 | 0
+  return (input[o] | input[o + 1] << 8) >> (pos & 7) & mask
+}
+
+/**
+ * read d, starting at bit p continuing for at least 16 bits
+ * @param {Uint8Array} d
+ * @param {number} p
+ * @returns {number}
+ */
+function bits16(d, p) {
+  const o = p / 8 | 0
+  return (d[o] | d[o + 1] << 8 | d[o + 2] << 16) >> (p & 7)
+}
+
+/**
+ * get end of byte
+ * @param {number} p
+ * @returns {number}
+ */
+function shft(p) {
+  return (p + 7) / 8 | 0
+}
+
+/**
+ * return index of the start of the gzip payload (first byte after the member header)
+ * @param {Uint8Array} input
+ * @param {number} i inputIndex
+ * @returns {number}
+ */
+function gzipStart(input, i) {
+  if (input[i++] !== 31 || input[i++] !== 139 || input[i++] !== 8) throw new Error('invalid gzip data')
+  const flag = input[i++]
+  i += 6 // skip MTIME (4 bytes), XFL, OS: i is now 10 bytes past the member start
+  if (flag & 4) i += (input[i] | input[i + 1] << 8) + 2 // FEXTRA: XLEN is right here, skip it and the extra field
+  for (let zs = (flag >> 3 & 1) + (flag >> 4 & 1); zs > 0; zs -= Number(!input[i++])); // FNAME, FCOMMENT: zero-terminated
+  return i + (flag & 2) // FHCRC: 2-byte header crc
+}
+
+/**
+ * GZip decompression
+ * @param {Uint8Array} input
+ * @param {Uint8Array} out
+ * @param {number} [inputIndex]
+ * @param {number} [outputIndex]
+ */
+export function gunzip(input, out, inputIndex = 0, outputIndex = 0) {
+  if (!(input.length - inputIndex)) return
+  const payloadStart = gzipStart(input, inputIndex)
+  if (payloadStart === input.length - 8) return
+  if (payloadStart > input.length - 8) throw new Error('unexpected EOF')
+  let pos = payloadStart * 8 // position in bits
+  let final = 0 // last chunk?
+  let lengthBits = 0
+  let distBits = 0
+  let lmap
+  let dmap
+  const totalBits = input.length * 8
+  do {
+    if (!lmap) {
+      // final chunk is next?
+      final = bits(input, pos, 1)
+      const type = bits(input, pos + 1, 3)
+      pos += 3
+      if (!type) {
+        // no compression
+        // go to end of byte boundary
+        const s = shft(pos) + 4
+        const l = input[s - 4] | input[s - 3] << 8
+        const t = s + l
+        if (t > input.length) throw new Error('unexpected EOF')
+        // copy uncompressed data
+        out.set(input.subarray(s, t), outputIndex)
+        outputIndex += l
+        pos = t * 8
+        continue
+      } else if (type === 1) {
+        // fixed huffman
+        lmap = fixedLengthMap
+        dmap = fixedDistanceMap
+        lengthBits = 9
+        distBits = 5
+      } else if (type === 2) {
+        // dynamic huffman
+        const hLiteral = bits(input, pos, 31) + 257
+        const hcLengths = bits(input, pos + 10, 15) + 4
+        const tl = hLiteral + bits(input, pos + 5, 31) + 1
+        pos += 14
+        // length+distance tree
+        const lengthDistanceTree = new Uint8Array(tl)
+        const codeLengthTree = new Uint8Array(19)
+        for (let i = 0; i < hcLengths; ++i) {
+          // use index map to get real code
+          codeLengthTree[codeLengthIndexMap[i]] = bits(input, pos + i * 3, 7)
+        }
+        pos += hcLengths * 3
+        const codeLengthBits = max(codeLengthTree)
+        const clbMask = (1 << codeLengthBits) - 1
+        const codeLengthMap = huffMap(codeLengthTree, codeLengthBits, 1)
+        for (let i = 0; i < tl;) {
+          const r = codeLengthMap[bits(input, pos, clbMask)]
+          // bits read
+          pos += r & 15
+          const symbol = r >> 4
+          // code length to copy
+          if (symbol < 16) {
+            lengthDistanceTree[i++] = symbol
+          } else {
+            let copy = 0
+            let n = 0 // count
+            if (symbol === 16) {
+              n = 3 + bits(input, pos, 3)
+              pos += 2
+              copy = lengthDistanceTree[i - 1]
+            } else if (symbol === 17) {
+              n = 3 + bits(input, pos, 7)
+              pos += 3
+            } else if (symbol === 18) {
+              n = 11 + bits(input, pos, 127)
+              pos += 7
+            }
+            while (n--) lengthDistanceTree[i++] = copy
+          }
+        }
+        const lengthTree = lengthDistanceTree.subarray(0, hLiteral)
+        const distanceTree = lengthDistanceTree.subarray(hLiteral)
+        // max length bits
+        lengthBits = max(lengthTree)
+        // max dist bits
+        distBits = max(distanceTree)
+        lmap = huffMap(lengthTree, lengthBits, 1)
+        dmap = huffMap(distanceTree, distBits, 1)
+      } else throw new Error('invalid block type')
+      if (pos > totalBits) throw new Error('unexpected EOF')
+    }
+    const lms = (1 << lengthBits) - 1
+    const dms = (1 << distBits) - 1
+    let lpos = pos
+    for (;; lpos = pos) {
+      // bits read, code
+      const code = lmap[bits16(input, pos) & lms]
+      const sym = code >> 4
+      pos += code & 15
+      if (pos > totalBits) throw new Error('unexpected EOF')
+      if (!code) throw new Error('invalid length/literal')
+      if (sym < 256) out[outputIndex++] = sym
+      else if (sym === 256) {
+        lpos = pos
+        lmap = undefined
+        break
+      } else {
+        let add = sym - 254
+        // no extra bits needed if less
+        if (sym > 264) {
+          const index = sym - 257
+          const b = fixedLengthExtraBits[index]
+          add = bits(input, pos, (1 << b) - 1) + fl[index]
+          pos += b
+        }
+        // dist
+        if (!dmap) throw new Error('invalid distance map')
+        const d = dmap[bits16(input, pos) & dms]
+        const dsym = d >> 4
+        if (!d) throw new Error('invalid distance')
+        pos += d & 15
+        let dt = fd[dsym]
+        if (dsym > 3) {
+          const b = fixedDistanceExtraBits[dsym]
+          dt += bits16(input, pos) & (1 << b) - 1
+          pos += b
+        }
+        if (pos > totalBits) throw new Error('unexpected EOF')
+        const end = outputIndex + add
+        if (outputIndex < dt) throw new Error('unexpected dictionary case')
+        for (; outputIndex < end; outputIndex++) out[outputIndex] = out[outputIndex - dt]
+      }
+    }
+    pos = lpos
+    if (lmap) final = 1
+  } while (!final)
+
+  if (outputIndex < out.length) {
+    // multiple gzip blocks
+    const nextBlock = Math.ceil(pos / 8) + 8 // 8 byte gzip footer
+    gunzip(input, out, nextBlock, outputIndex)
+  }
+}
diff --git a/src/index.js b/src/index.js
index 2c91c96..8edf938 100644
--- a/src/index.js
+++ b/src/index.js
@@ -1,7 +1,7 @@
 import BROTLI from 'brotli/decompress.js'
 import { decompress as ZSTD } from 'fzstd'
 import { snappyUncompressor } from 'hysnappy'
-import pako from 'pako'
+import { gunzip } from './gzip.js'
 import { LZ4, LZ4_RAW } from './lz4.js'
 
 /**
@@ -9,7 +9,11 @@ import { LZ4, LZ4_RAW } from './lz4.js'
  */
 export const compressors = {
   SNAPPY: snappyUncompressor(),
-  GZIP: input => pako.ungzip(input),
+  GZIP: (input, length) => {
+    const out = new Uint8Array(length)
+    gunzip(input, out)
+    return out
+  },
   // @ts-expect-error brotli expects Buffer but Uint8Array works
   BROTLI,
   ZSTD: input => ZSTD(input),
diff --git a/test/gzip.test.js b/test/gzip.test.js
index 941f5bf..6b9b1bc 100644
--- a/test/gzip.test.js
+++ b/test/gzip.test.js
@@ -1,10 +1,50 @@
 import fs from 'fs'
 import { parquetRead, toJson } from 'hyparquet'
 import { describe, expect, it } from 'vitest'
+import { gunzip } from '../src/gzip.js'
 import { compressors } from '../src/index.js'
 
 describe('gzip compressor', () => {
-  it('should read gzip compressed parquet file', async () => {
+  it('read empty gzip data', () => {
+    const input = new Uint8Array(0)
+    const output = new Uint8Array(0)
+    gunzip(input, output)
+  })
+
+  it('read empty gzip block', () => {
+    const input = new Uint8Array([31, 139, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
+    const output = new Uint8Array(0)
+    gunzip(input, output)
+  })
+
+  it('read gzip block', () => {
+    const input = new Uint8Array([31, 139, 8, 0, 77, 204, 77, 102, 0, 3, 227, 230, 22, 83, 4, 0, 117, 18, 225, 170, 4, 0, 0, 0])
+    const output = new Uint8Array(4)
+    gunzip(input, output)
+    expect(output).toEqual(new Uint8Array([11, 11, 22, 33]))
+  })
+
+  it('read gzip repeated block', () => {
+    const input = new Uint8Array([
+      31, 139, 8, 0, 142, 75, 78, 102,
+      0, 3, 237, 192, 1, 13, 0, 0,
+      0, 194, 160, 62, 246, 15, 104, 143,
+      15, 6, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 1, 7, 69, 96, 68,
+      21, 16, 39, 0, 0,
+    ])
+    const output = new Uint8Array(10000)
+    gunzip(input, output)
+    expect(output).toEqual(new Uint8Array(new Array(10000).fill(42)))
+  })
+
+  it('throw error on invalid gzip data', () => {
+    const input = new Uint8Array([31, 139, 8, 4, 0, 0, 0, 0, 0, 255, 255, 0, 0, 0, 0, 0, 0])
+    const output = new Uint8Array(4)
+    expect(() => gunzip(input, output)).toThrow('unexpected EOF')
+  })
+
+  it('read gzip compressed parquet file', async () => {
     const buffer = fs.readFileSync('test/files/concatenated_gzip_members.parquet')
     const file = new Uint8Array(buffer).buffer
     const expected = fs.readFileSync('test/files/concatenated_gzip_members.json').toString()