From 66b832d5bb20a8fd374877c040249456a3414817 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Tue, 21 May 2024 17:29:13 -0700 Subject: [PATCH] Fix RLE encoding length --- src/datapage.js | 1 - src/datapageV2.js | 9 ++++--- src/encoding.js | 56 +++++++++++++++++++++---------------------- test/encoding.test.js | 8 +++---- 4 files changed, 35 insertions(+), 39 deletions(-) diff --git a/src/datapage.js b/src/datapage.js index 22f9f32..93a95a1 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -36,7 +36,6 @@ export function readDataPage(bytes, daph, schemaPath, { type }) { daph.encoding === 'RLE_DICTIONARY' || daph.encoding === 'RLE' ) { - // TODO: RLE encoding uses bitWidth = schemaElement.type_length const bitWidth = type === 'BOOLEAN' ? 1 : view.getUint8(reader.offset++) if (bitWidth) { dataPage = new Array(nValues) diff --git a/src/datapageV2.js b/src/datapageV2.js index 1e01af6..7c8fb0e 100644 --- a/src/datapageV2.js +++ b/src/datapageV2.js @@ -50,17 +50,16 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, const { type_length } = schemaPath[schemaPath.length - 1].element dataPage = readPlain(pageReader, type, nValues, type_length) } else if (daph2.encoding === 'RLE') { - pageReader.offset = 4 + // assert(columnMetadata.type === 'BOOLEAN') dataPage = new Array(nValues) - readRleBitPackedHybrid(pageReader, 1, uncompressedPageSize, dataPage) + readRleBitPackedHybrid(pageReader, 1, 0, dataPage) } else if ( daph2.encoding === 'PLAIN_DICTIONARY' || daph2.encoding === 'RLE_DICTIONARY' ) { - const bitWidth = pageView.getUint8(0) - pageReader.offset = 1 + const bitWidth = pageView.getUint8(pageReader.offset++) dataPage = new Array(nValues) - readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, dataPage) + readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize - 1, dataPage) } else if (daph2.encoding === 'DELTA_BINARY_PACKED') { const int32 = type === 'INT32' dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues) diff --git a/src/encoding.js b/src/encoding.js index 0e9037a..82d328f 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -1,10 +1,10 @@ import { readVarInt } from './thrift.js' /** - * Convert the value specified to a bit width. + * Minimum bits needed to store value. * - * @param {number} value - value to convert to bitwidth - * @returns {number} bit width of the value + * @param {number} value + * @returns {number} */ export function widthFromMaxInt(value) { return Math.ceil(Math.log2(value + 1)) @@ -20,41 +20,39 @@ export function widthFromMaxInt(value) { * @param {DataReader} reader - buffer to read data from * @param {number} width - width of each bit-packed group * @param {number} length - length of the encoded data - * @param {DecodedArray} values - output array + * @param {DecodedArray} output */ -export function readRleBitPackedHybrid(reader, width, length, values) { +export function readRleBitPackedHybrid(reader, width, length, output) { if (!length) { - length = reader.view.getUint32(reader.offset, true) + // length = reader.view.getUint32(reader.offset, true) reader.offset += 4 } let seen = 0 - while (seen < values.length) { + while (seen < output.length) { const header = readVarInt(reader) if (header & 1) { // bit-packed - seen = readBitPacked(reader, header, width, values, seen) + seen = readBitPacked(reader, header, width, output, seen) } else { // rle const count = header >>> 1 - readRle(reader, count, width, values, seen) + readRle(reader, count, width, output, seen) seen += count } } + // assert(reader.offset - startOffset === length) } /** - * Read a run-length encoded value. + * Run-length encoding: read value with bitWidth and repeat it count times. * - * The count is determined from the header and the width is used to grab the - * value that's repeated. Yields the value repeated count times. - * - * @param {DataReader} reader - buffer to read data from - * @param {number} count - number of values to read - * @param {number} bitWidth - width of each bit-packed group - * @param {DecodedArray} values - output array - * @param {number} seen - number of values seen so far + * @param {DataReader} reader + * @param {number} count + * @param {number} bitWidth + * @param {DecodedArray} output + * @param {number} seen */ -function readRle(reader, count, bitWidth, values, seen) { +function readRle(reader, count, bitWidth, output, seen) { const width = bitWidth + 7 >> 3 let value = 0 if (width === 1) { @@ -70,7 +68,7 @@ function readRle(reader, count, bitWidth, values, seen) { // repeat value count times for (let i = 0; i < count; i++) { - values[seen + i] = value + output[seen + i] = value } } @@ -78,14 +76,14 @@ function readRle(reader, count, bitWidth, values, seen) { * Read a bit-packed run of the rle/bitpack hybrid. * Supports width > 8 (crossing bytes). * - * @param {DataReader} reader - buffer to read data from - * @param {number} header - header information - * @param {number} bitWidth - width of each bit-packed group - * @param {DecodedArray} values - output array - * @param {number} seen - number of values seen so far - * @returns {number} number of values seen + * @param {DataReader} reader + * @param {number} header - bit-pack header + * @param {number} bitWidth + * @param {DecodedArray} output + * @param {number} seen + * @returns {number} total output values so far */ -function readBitPacked(reader, header, bitWidth, values, seen) { +function readBitPacked(reader, header, bitWidth, output, seen) { let count = header >> 1 << 3 // values to read const mask = (1 << bitWidth) - 1 @@ -112,9 +110,9 @@ function readBitPacked(reader, header, bitWidth, values, seen) { reader.offset++ left += 8 } else { - if (seen < values.length) { + if (seen < output.length) { // emit value - values[seen++] = data >> right & mask + output[seen++] = data >> right & mask } count-- right += bitWidth diff --git a/test/encoding.test.js b/test/encoding.test.js index d05b3db..3c459e1 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -14,7 +14,7 @@ describe('readRleBitPackedHybrid', () => { const reader = { view, offset: 0 } const values = new Array(6) - readRleBitPackedHybrid(reader, 1, 6, values) + readRleBitPackedHybrid(reader, 1, 4, values) expect(reader.offset).toBe(4) expect(values).toEqual([1, 1, 1, 100, 100, 100]) }) @@ -42,7 +42,7 @@ describe('readRleBitPackedHybrid', () => { const reader = { view, offset: 0 } const values = new Array(3) - readRleBitPackedHybrid(reader, 32, 3, values) + readRleBitPackedHybrid(reader, 32, 5, values) expect(reader.offset).toBe(5) expect(values).toEqual([234000, 234000, 234000]) }) @@ -62,7 +62,7 @@ describe('readRleBitPackedHybrid', () => { // Bit-packed values: false, false, true const buffer = new ArrayBuffer(8) const view = new DataView(buffer) - view.setInt32(0, 3, true) // length 3 little-endian + view.setInt32(0, 2, true) // length 2 little-endian view.setUint8(4, 0b00000011) // Bit-packed header for 1-8 values view.setUint8(5, 0b00000100) // Bit-packed values (false, false, true) const reader = { view, offset: 0 } @@ -83,7 +83,7 @@ describe('readRleBitPackedHybrid', () => { const reader = { view, offset: 0 } const values = new Array(9) - readRleBitPackedHybrid(reader, 1, 9, values) + readRleBitPackedHybrid(reader, 1, 3, values) expect(reader.offset).toBe(3) expect(values).toEqual([1, 1, 1, 1, 1, 1, 1, 1, 1]) })