Write BitPacked

This commit is contained in:
Kenny Daniel 2025-03-25 13:36:18 -07:00
parent 045256b478
commit 76889680b9
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 135 additions and 0 deletions

71
src/encoding.js Normal file

@ -0,0 +1,71 @@
/**
 * Encode values using the RLE / bit-packed hybrid encoding.
 *
 * Currently this always emits a single bit-packed run whose bit width is
 * derived from the largest value in the input.
 *
 * @import {Writer} from './writer.js'
 * @param {Writer} writer destination that receives the encoded bytes
 * @param {number[]} values values to encode; expected to be non-negative
 *   integers below 2^32 (the width computation works on unsigned 32-bit ints)
 */
export function writeRleBitPackedHybrid(writer, values) {
  // Find the maximum with a loop: Math.max(...values) passes every element as
  // a call argument and throws a RangeError on large arrays.
  let max = 0
  for (const value of values) {
    if (value > max) max = value
  }
  // Bits needed to represent max: 0 -> 0, 1 -> 1, 255 -> 8, 256 -> 9.
  // Integer arithmetic avoids any dependence on Math.log2 rounding.
  const bitWidth = 32 - Math.clz32(max)
  // TODO: Try both RLE and bit-packed and choose the best
  writeBitPacked(writer, values, bitWidth)
}
/**
* @param {Writer} writer
* @param {number[]} values
* @param {number} bitWidth
*/
function writeBitPacked(writer, values, bitWidth) {
// Number of 8-value groups
const numGroups = Math.ceil(values.length / 8)
// The parquet bitpack header: lower bit = 1 => "bit-packed mode"
// upper bits = number of groups
const header = numGroups << 1 | 1
// Write the header as a varint
writer.appendVarInt(header)
// If bitWidth = 0, no data is actually needed
if (bitWidth === 0 || values.length === 0) {
return
}
const mask = (1 << bitWidth) - 1
let buffer = 0 // accumulates bits
let bitsUsed = 0 // how many bits are in 'buffer' so far
// Write out each value, bit-packing into buffer
for (let i = 0; i < values.length; i++) {
const v = values[i] & mask // mask off bits exceeding bitWidth
buffer |= v << bitsUsed
bitsUsed += bitWidth
// Flush full bytes
while (bitsUsed >= 8) {
writer.appendUint8(buffer & 0xFF)
buffer >>>= 8
bitsUsed -= 8
}
}
// Pad the final partial group with zeros if needed
const totalNeeded = numGroups * 8
for (let padCount = values.length; padCount < totalNeeded; padCount++) {
// Just write a 0 value into the buffer
buffer |= 0 << bitsUsed
bitsUsed += bitWidth
while (bitsUsed >= 8) {
writer.appendUint8(buffer & 0xFF)
buffer >>>= 8
bitsUsed -= 8
}
}
// Flush any remaining bits
if (bitsUsed > 0) {
writer.appendUint8(buffer & 0xFF)
}
}

1
src/types.d.ts vendored

@ -2,6 +2,7 @@
export interface Writer {
buffer: ArrayBuffer
offset: number
getBuffer(): ArrayBuffer
appendUint8(value: number): void
appendUint32(value: number): void
appendFloat64(value: number): void

@ -25,6 +25,10 @@ Writer.prototype.ensure = function(size) {
}
}
/**
 * Return a copy of the underlying buffer from byte 0 up to the current offset.
 *
 * @returns {ArrayBuffer}
 */
Writer.prototype.getBuffer = function() {
  const { buffer, offset } = this
  return buffer.slice(0, offset)
}
/**
* @param {number} value
*/

59
test/encoding.test.js Normal file

@ -0,0 +1,59 @@
import { describe, expect, it } from 'vitest'
import { Writer } from '../src/writer.js'
import { writeRleBitPackedHybrid } from '../src/encoding.js'
import { readRleBitPackedHybrid } from 'hyparquet/src/encoding.js'
/**
 * Encode the given values with writeRleBitPackedHybrid, then decode the
 * resulting bytes with hyparquet's readRleBitPackedHybrid.
 *
 * @param {number[]} values values to encode
 * @returns {number[]} values read back from the encoded bytes
 */
function roundTripDeserialize(values) {
  // Bit width handed to the reader, derived from the largest input value
  const largest = Math.max(...values)
  const width = Math.ceil(Math.log2(largest + 1))

  // Encode into a fresh writer
  const encoder = new Writer()
  writeRleBitPackedHybrid(encoder, values)

  // Wrap the written bytes in the reader shape: a DataView plus a read offset
  const view = new DataView(encoder.getBuffer())
  const source = { view, offset: 0 }

  // Decode into a pre-sized output array
  /** @type {number[]} */
  const decoded = new Array(values.length)
  readRleBitPackedHybrid(source, width, values.length, decoded)
  return decoded
}
describe('RLE bit-packed hybrid', () => {
  // Each case is a test title plus a factory for its input. The factory runs
  // inside the test body, so the random input is generated at test time.
  /** @type {[string, () => number[]][]} */
  const cases = [
    ['should round-trip a typical array of values', () => [1, 2, 3, 4, 5, 6, 7, 8, 9]],
    ['should round-trip an empty array', () => []],
    ['should round-trip an array of zeros', () => [0, 0, 0, 0, 0, 0, 0, 0]],
    ['should round-trip an array with large numbers', () => [1023, 511, 255, 127, 63, 31, 15, 7]],
    [
      'should round-trip a random array of values',
      () => Array.from({ length: 20 }, () => Math.floor(Math.random() * 1000)),
    ],
  ]

  for (const [title, makeInput] of cases) {
    it(title, () => {
      const input = makeInput()
      // Decoding the encoded bytes must reproduce the input exactly
      expect(roundTripDeserialize(input)).toEqual(input)
    })
  }
})