hyparquet-writer/src/encoding.js

72 lines
1.8 KiB
JavaScript
Raw Normal View History

2025-03-25 20:36:18 +00:00
/**
* @import {Writer} from './writer.js'
* @param {Writer} writer
* @param {number[]} values
*/
export function writeRleBitPackedHybrid(writer, values) {
// find max bitwidth
const bitWidth = Math.ceil(Math.log2(Math.max(...values) + 1))
// TODO: Try both RLE and bit-packed and choose the best
writeBitPacked(writer, values, bitWidth)
}
/**
* @param {Writer} writer
* @param {number[]} values
* @param {number} bitWidth
*/
function writeBitPacked(writer, values, bitWidth) {
// Number of 8-value groups
const numGroups = Math.ceil(values.length / 8)
// The parquet bitpack header: lower bit = 1 => "bit-packed mode"
// upper bits = number of groups
const header = numGroups << 1 | 1
// Write the header as a varint
writer.appendVarInt(header)
// If bitWidth = 0, no data is actually needed
if (bitWidth === 0 || values.length === 0) {
return
}
const mask = (1 << bitWidth) - 1
let buffer = 0 // accumulates bits
let bitsUsed = 0 // how many bits are in 'buffer' so far
// Write out each value, bit-packing into buffer
for (let i = 0; i < values.length; i++) {
const v = values[i] & mask // mask off bits exceeding bitWidth
buffer |= v << bitsUsed
bitsUsed += bitWidth
// Flush full bytes
while (bitsUsed >= 8) {
writer.appendUint8(buffer & 0xFF)
buffer >>>= 8
bitsUsed -= 8
}
}
// Pad the final partial group with zeros if needed
const totalNeeded = numGroups * 8
for (let padCount = values.length; padCount < totalNeeded; padCount++) {
// Just write a 0 value into the buffer
buffer |= 0 << bitsUsed
bitsUsed += bitWidth
while (bitsUsed >= 8) {
writer.appendUint8(buffer & 0xFF)
buffer >>>= 8
bitsUsed -= 8
}
}
// Flush any remaining bits
if (bitsUsed > 0) {
writer.appendUint8(buffer & 0xFF)
}
}