Choose best of RLE or bit-packed for hybrid encoding

sparse booleans: 2634 -> 1399 bytes
This commit is contained in:
Kenny Daniel 2025-03-25 22:55:02 -07:00
parent 44d0d0c77a
commit 947a78f72d
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
2 changed files with 61 additions and 4 deletions

@ -1,6 +1,6 @@
import { Writer } from './writer.js'
/**
* @import {Writer} from './writer.js'
* @param {Writer} writer
* @param {number[]} values
* @returns {number} bytes written
@ -10,8 +10,18 @@ export function writeRleBitPackedHybrid(writer, values) {
// find max bitwidth
const bitWidth = Math.ceil(Math.log2(Math.max(...values) + 1))
// TODO: Try both RLE and bit-packed and choose the best
writeBitPacked(writer, values, bitWidth)
// try both RLE and bit-packed and choose the best
const rle = new Writer()
writeRle(rle, values, bitWidth)
const bitPacked = new Writer()
writeBitPacked(bitPacked, values, bitWidth)
if (rle.offset < bitPacked.offset) {
writer.appendBuffer(rle.getBuffer())
} else {
writer.appendBuffer(bitPacked.getBuffer())
}
return writer.offset - offsetStart
}
@ -69,6 +79,41 @@ function writeBitPacked(writer, values, bitWidth) {
// Flush any remaining bits
if (bitsUsed > 0) {
writer.appendUint8(buffer & 0xFF)
writer.appendUint8(buffer & 0xff)
}
}
/**
 * Run-length encoding: write each run of repeated values as an RLE run of the
 * RLE/bit-packed hybrid format, i.e. a varint header followed by the value.
 *
 * The header is the run length shifted left by one bit. The low bit of a hybrid
 * run header selects the run type (0 = RLE, 1 = bit-packed), so writing the
 * bare count would make odd-length runs read back as bit-packed runs.
 *
 * Writes nothing when values is empty.
 *
 * @param {Writer} writer
 * @param {number[]} values
 * @param {number} bitWidth
 */
function writeRle(writer, values, bitWidth) {
  if (!values.length) return

  // bytes needed to hold one value: ceil(bitWidth / 8), the same for every run
  const width = bitWidth + 7 >> 3

  let currentValue = values[0]
  let count = 1

  // iterate one past the end so the final run is flushed by the same branch
  for (let i = 1; i <= values.length; i++) {
    if (i < values.length && values[i] === currentValue) {
      count++ // continue the run
    } else {
      // rle header: run length in the upper bits, low bit 0 marks an RLE run
      writer.appendVarInt(count << 1)

      // write the repeated value, least significant byte first
      for (let j = 0; j < width; j++) {
        writer.appendUint8(currentValue >> (j << 3) & 0xff)
      }

      // reset for the next run
      if (i < values.length) {
        currentValue = values[i]
        count = 1
      }
    }
  }
}

@ -39,4 +39,16 @@ describe('parquetWrite', () => {
{ bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, double: 1e100, string: 'd', nullable: null },
])
})
it('efficiently serializes sparse booleans', () => {
  // 10000-row column that is null everywhere except four positions
  const bool = Array.from({ length: 10000 }, () => null)
  const populated = [[10, true], [100, false], [500, true], [9999, false]]
  for (const [row, flag] of populated) {
    bool[row] = flag
  }

  // the whole file, footer included, should stay compact
  const file = parquetWrite({ bool })
  expect(file.byteLength).toBe(1399)

  const { metadata_length } = parquetMetadata(file)
  expect(metadata_length).toBe(89)
})
})