diff --git a/src/encoding.js b/src/encoding.js index 3909397..ad20a94 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -1,6 +1,6 @@ +import { Writer } from './writer.js' /** - * @import {Writer} from './writer.js' * @param {Writer} writer * @param {number[]} values * @returns {number} bytes written @@ -10,8 +10,18 @@ export function writeRleBitPackedHybrid(writer, values) { // find max bitwidth const bitWidth = Math.ceil(Math.log2(Math.max(...values) + 1)) - // TODO: Try both RLE and bit-packed and choose the best - writeBitPacked(writer, values, bitWidth) + // try both RLE and bit-packed and choose the best + const rle = new Writer() + writeRle(rle, values, bitWidth) + const bitPacked = new Writer() + writeBitPacked(bitPacked, values, bitWidth) + + if (rle.offset < bitPacked.offset) { + writer.appendBuffer(rle.getBuffer()) + } else { + writer.appendBuffer(bitPacked.getBuffer()) + } + return writer.offset - offsetStart } @@ -69,6 +79,41 @@ function writeBitPacked(writer, values, bitWidth) { // Flush any remaining bits if (bitsUsed > 0) { - writer.appendUint8(buffer & 0xFF) + writer.appendUint8(buffer & 0xff) + } +} + +/** + * Run-length encoding: write repeated values by encoding the value and its count. + * + * @param {Writer} writer + * @param {number[]} values + * @param {number} bitWidth + */ +function writeRle(writer, values, bitWidth) { + if (!values.length) return + + let currentValue = values[0] + let count = 1 + + for (let i = 1; i <= values.length; i++) { + if (i < values.length && values[i] === currentValue) { + count++ // continue the run + } else { + // write the count of repeated values + writer.appendVarInt(count) + + // write the value + const width = bitWidth + 7 >> 3 // bytes needed + for (let j = 0; j < width; j++) { + writer.appendUint8(currentValue >> (j << 3) & 0xff) + } + + // reset for the next run + if (i < values.length) { + currentValue = values[i] + count = 1 + } + } } } diff --git a/test/write.test.js b/test/write.test.js index a031a83..59d98ee 100644 --- a/test/write.test.js +++ b/test/write.test.js @@ -39,4 +39,16 @@ describe('parquetWrite', () => { { bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, double: 1e100, string: 'd', nullable: null }, ]) }) + + it('efficiently serializes sparse booleans', () => { + const bool = Array(10000).fill(null) + bool[10] = true + bool[100] = false + bool[500] = true + bool[9999] = false + const buffer = parquetWrite({ bool }) + expect(buffer.byteLength).toBe(1399) + const metadata = parquetMetadata(buffer) + expect(metadata.metadata_length).toBe(89) + }) })