From 76889680b95c80346b62a2718e355c10b521c250 Mon Sep 17 00:00:00 2001
From: Kenny Daniel
Date: Tue, 25 Mar 2025 13:36:18 -0700
Subject: [PATCH] Write BitPacked

---
 src/encoding.js       | 75 +++++++++++++++++++++++++++++++++++++++++++
 src/types.d.ts        |  1 +
 src/writer.js         |  9 +++++
 test/encoding.test.js | 68 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+)
 create mode 100644 src/encoding.js
 create mode 100644 test/encoding.test.js

diff --git a/src/encoding.js b/src/encoding.js
new file mode 100644
index 0000000..dc154de
--- /dev/null
+++ b/src/encoding.js
@@ -0,0 +1,75 @@
+
+/**
+ * Write values as one bit-packed run of the parquet RLE/bit-packed hybrid
+ * encoding, using the smallest bit width that fits the largest value.
+ *
+ * Values are expected to be non-negative integers below 2^32.
+ *
+ * @import {Writer} from './writer.js'
+ * @param {Writer} writer
+ * @param {number[]} values
+ */
+export function writeRleBitPackedHybrid(writer, values) {
+  // find max bitwidth
+  // Loop instead of Math.max(...values): spreading throws a RangeError on very
+  // large arrays and gives -Infinity (so a NaN bit width) on an empty one.
+  let max = 0
+  for (const value of values) {
+    if (value > max) max = value
+  }
+  const bitWidth = Math.ceil(Math.log2(max + 1))
+
+  // TODO: Try both RLE and bit-packed and choose the best
+  writeBitPacked(writer, values, bitWidth)
+}
+
+/**
+ * Write a bit-packed run: a varint header, then the values packed
+ * least-significant-bit first, zero-padded to a multiple of 8 values.
+ *
+ * @param {Writer} writer
+ * @param {number[]} values
+ * @param {number} bitWidth bits per value, 0 to 32
+ */
+function writeBitPacked(writer, values, bitWidth) {
+  // Number of 8-value groups
+  const numGroups = Math.ceil(values.length / 8)
+
+  // The parquet bitpack header: lower bit = 1 => "bit-packed mode"
+  // upper bits = number of groups
+  const header = numGroups << 1 | 1
+
+  // Write the header as a varint
+  writer.appendVarInt(header)
+
+  // If bitWidth = 0, no data is actually needed
+  if (bitWidth === 0 || values.length === 0) {
+    return
+  }
+
+  // The accumulator uses plain arithmetic instead of 32-bit bitwise operators:
+  // `v << bitsUsed` drops high bits once bitWidth + bitsUsed exceeds 32.
+  // At most 7 + 32 bits are ever pending, which a double holds exactly.
+  let buffer = 0 // accumulates bits
+  let bitsUsed = 0 // how many bits are in 'buffer' so far
+
+  // Write out each value, bit-packing into buffer.
+  // Indexes past values.length pad the final partial group with zeros.
+  const totalNeeded = numGroups * 8
+  for (let i = 0; i < totalNeeded; i++) {
+    const raw = i < values.length ? values[i] : 0
+    // mask off bits exceeding bitWidth (1 << 32 wraps to 1, so special-case 32)
+    const v = bitWidth < 32 ? raw & (2 ** bitWidth - 1) : raw >>> 0
+    buffer += v * 2 ** bitsUsed
+    bitsUsed += bitWidth
+
+    // Flush full bytes
+    while (bitsUsed >= 8) {
+      writer.appendUint8(buffer % 256)
+      buffer = Math.floor(buffer / 256)
+      bitsUsed -= 8
+    }
+  }
+  // 8 values per group makes the total bit count a multiple of 8,
+  // so no partial byte is left to flush here.
+}
diff --git a/src/types.d.ts b/src/types.d.ts
index 4e17945..048cc76 100644
--- a/src/types.d.ts
+++ b/src/types.d.ts
@@ -2,6 +2,7 @@
 export interface Writer {
   buffer: ArrayBuffer
   offset: number
+  getBuffer(): ArrayBuffer
   appendUint8(value: number): void
   appendUint32(value: number): void
   appendFloat64(value: number): void
diff --git a/src/writer.js b/src/writer.js
index 858b6a7..6532ade 100644
--- a/src/writer.js
+++ b/src/writer.js
@@ -25,6 +25,15 @@ Writer.prototype.ensure = function(size) {
   }
 }
 
+/**
+ * Get a copy of the bytes written so far.
+ *
+ * @returns {ArrayBuffer}
+ */
+Writer.prototype.getBuffer = function() {
+  return this.buffer.slice(0, this.offset)
+}
+
 /**
  * @param {number} value
  */
diff --git a/test/encoding.test.js b/test/encoding.test.js
new file mode 100644
index 0000000..ad508d1
--- /dev/null
+++ b/test/encoding.test.js
@@ -0,0 +1,68 @@
+import { describe, expect, it } from 'vitest'
+import { Writer } from '../src/writer.js'
+import { writeRleBitPackedHybrid } from '../src/encoding.js'
+import { readRleBitPackedHybrid } from 'hyparquet/src/encoding.js'
+
+/**
+ * Round-trip serialize and deserialize the given values.
+ *
+ * @param {number[]} values
+ * @returns {number[]}
+ */
+function roundTripDeserialize(values) {
+  const bitWidth = Math.ceil(Math.log2(Math.max(0, ...values) + 1))
+
+  // Serialize the values using writeRleBitPackedHybrid
+  const writer = new Writer()
+  writeRleBitPackedHybrid(writer, values)
+  const buffer = writer.getBuffer()
+  const reader = { view: new DataView(buffer), offset: 0 }
+
+  // Decode the values using readRleBitPackedHybrid from hyparquet
+  /** @type {number[]} */
+  const output = new Array(values.length)
+  readRleBitPackedHybrid(reader, bitWidth, values.length, output)
+  return output
+}
+
+describe('RLE bit-packed hybrid', () => {
+  it('should round-trip a typical array of values', () => {
+    const original = [1, 2, 3, 4, 5, 6, 7, 8, 9]
+    const decoded = roundTripDeserialize(original)
+    expect(decoded).toEqual(original)
+  })
+
+  it('should round-trip an empty array', () => {
+    const decoded = roundTripDeserialize([])
+    expect(decoded).toEqual([])
+  })
+
+  it('should round-trip an array of zeros', () => {
+    const original = [0, 0, 0, 0, 0, 0, 0, 0]
+    const decoded = roundTripDeserialize(original)
+    expect(decoded).toEqual(original)
+  })
+
+  it('should round-trip an array with large numbers', () => {
+    const original = [1023, 511, 255, 127, 63, 31, 15, 7]
+    const decoded = roundTripDeserialize(original)
+    expect(decoded).toEqual(original)
+  })
+
+  it('should round-trip a random array of values', () => {
+    const original = Array.from({ length: 20 }, () =>
+      Math.floor(Math.random() * 1000)
+    )
+    const decoded = roundTripDeserialize(original)
+    expect(decoded).toEqual(original)
+  })
+
+  it('should keep the high bits of values wider than 24 bits', () => {
+    const writer = new Writer()
+    writeRleBitPackedHybrid(writer, [0xffffffff, 1])
+    const bytes = Array.from(new Uint8Array(writer.getBuffer()))
+    // header 3 = one bit-packed group, then 8 values of 32 bits each
+    expect(bytes.length).toBe(33)
+    expect(bytes.slice(0, 9)).toEqual([3, 255, 255, 255, 255, 1, 0, 0, 0])
+  })
+})