mirror of
https://github.com/asadbek064/hyparquet-writer.git
synced 2025-12-05 23:31:54 +00:00
Choose best of RLE or bit-packed for hybrid encoding
sparse booleans: 2634 -> 1399 bytes
This commit is contained in:
parent
44d0d0c77a
commit
947a78f72d
@ -1,6 +1,6 @@
|
||||
import { Writer } from './writer.js'
|
||||
|
||||
/**
|
||||
* @import {Writer} from './writer.js'
|
||||
* @param {Writer} writer
|
||||
* @param {number[]} values
|
||||
* @returns {number} bytes written
|
||||
@ -10,8 +10,18 @@ export function writeRleBitPackedHybrid(writer, values) {
|
||||
// find max bitwidth
|
||||
const bitWidth = Math.ceil(Math.log2(Math.max(...values) + 1))
|
||||
|
||||
// TODO: Try both RLE and bit-packed and choose the best
|
||||
writeBitPacked(writer, values, bitWidth)
|
||||
// try both RLE and bit-packed and choose the best
|
||||
const rle = new Writer()
|
||||
writeRle(rle, values, bitWidth)
|
||||
const bitPacked = new Writer()
|
||||
writeBitPacked(bitPacked, values, bitWidth)
|
||||
|
||||
if (rle.offset < bitPacked.offset) {
|
||||
writer.appendBuffer(rle.getBuffer())
|
||||
} else {
|
||||
writer.appendBuffer(bitPacked.getBuffer())
|
||||
}
|
||||
|
||||
return writer.offset - offsetStart
|
||||
}
|
||||
|
||||
@ -69,6 +79,41 @@ function writeBitPacked(writer, values, bitWidth) {
|
||||
|
||||
// Flush any remaining bits
|
||||
if (bitsUsed > 0) {
|
||||
writer.appendUint8(buffer & 0xFF)
|
||||
writer.appendUint8(buffer & 0xff)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Run-length encoding: write each run of repeated values as a run header
 * followed by the repeated value.
 *
 * Per the Parquet RLE/bit-packed hybrid format, every run is emitted as
 * `varint(count << 1)` followed by the value in ceil(bitWidth / 8)
 * little-endian bytes. The low bit of the header must be 0 for an RLE run;
 * writing the raw count would make odd-length runs look like bit-packed runs
 * to a reader.
 *
 * Nothing is written when `values` is empty.
 *
 * @param {Writer} writer
 * @param {number[]} values
 * @param {number} bitWidth
 */
function writeRle(writer, values, bitWidth) {
  if (!values.length) return

  // bytes needed to store one value (same for every run)
  const width = bitWidth + 7 >> 3

  let currentValue = values[0]
  let count = 1

  // iterate one past the end so the final run is flushed by the same branch
  for (let i = 1; i <= values.length; i++) {
    if (i < values.length && values[i] === currentValue) {
      count++ // continue the run
      continue
    }

    // run header: run length shifted left, low bit 0 marks an RLE run
    writer.appendVarInt(count << 1)

    // write the repeated value, least-significant byte first
    for (let j = 0; j < width; j++) {
      writer.appendUint8(currentValue >> (j << 3) & 0xff)
    }

    // reset for the next run
    if (i < values.length) {
      currentValue = values[i]
      count = 1
    }
  }
}
|
||||
|
||||
@ -39,4 +39,16 @@ describe('parquetWrite', () => {
|
||||
{ bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, double: 1e100, string: 'd', nullable: null },
|
||||
])
|
||||
})
|
||||
|
||||
it('efficiently serializes sparse booleans', () => {
  // 10000 rows, all null except four explicitly set booleans
  const sparse = new Array(10000).fill(null)
  sparse[10] = true
  sparse[100] = false
  sparse[500] = true
  sparse[9999] = false

  const file = parquetWrite({ bool: sparse })

  // pin the total serialized size and the footer metadata length
  expect(file.byteLength).toBe(1399)
  const { metadata_length } = parquetMetadata(file)
  expect(metadata_length).toBe(89)
})
|
||||
})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user