Fix RLE encoding length

This commit is contained in:
Kenny Daniel 2024-05-21 17:29:13 -07:00
parent a1ca1ef785
commit 66b832d5bb
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 35 additions and 39 deletions

@@ -36,7 +36,6 @@ export function readDataPage(bytes, daph, schemaPath, { type }) {
daph.encoding === 'RLE_DICTIONARY' ||
daph.encoding === 'RLE'
) {
// TODO: RLE encoding uses bitWidth = schemaElement.type_length
const bitWidth = type === 'BOOLEAN' ? 1 : view.getUint8(reader.offset++)
if (bitWidth) {
dataPage = new Array(nValues)

@@ -50,17 +50,16 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata,
const { type_length } = schemaPath[schemaPath.length - 1].element
dataPage = readPlain(pageReader, type, nValues, type_length)
} else if (daph2.encoding === 'RLE') {
pageReader.offset = 4
// assert(columnMetadata.type === 'BOOLEAN')
dataPage = new Array(nValues)
readRleBitPackedHybrid(pageReader, 1, uncompressedPageSize, dataPage)
readRleBitPackedHybrid(pageReader, 1, 0, dataPage)
} else if (
daph2.encoding === 'PLAIN_DICTIONARY' ||
daph2.encoding === 'RLE_DICTIONARY'
) {
const bitWidth = pageView.getUint8(0)
pageReader.offset = 1
const bitWidth = pageView.getUint8(pageReader.offset++)
dataPage = new Array(nValues)
readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, dataPage)
readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize - 1, dataPage)
} else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
const int32 = type === 'INT32'
dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues)

@@ -1,10 +1,10 @@
import { readVarInt } from './thrift.js'
/**
* Convert the value specified to a bit width.
* Minimum bits needed to store value.
*
* @param {number} value - value to convert to bitwidth
* @returns {number} bit width of the value
* @param {number} value
* @returns {number}
*/
export function widthFromMaxInt(value) {
return Math.ceil(Math.log2(value + 1))
@@ -20,41 +20,39 @@ export function widthFromMaxInt(value) {
* @param {DataReader} reader - buffer to read data from
* @param {number} width - width of each bit-packed group
* @param {number} length - length of the encoded data
* @param {DecodedArray} values - output array
* @param {DecodedArray} output
*/
export function readRleBitPackedHybrid(reader, width, length, values) {
export function readRleBitPackedHybrid(reader, width, length, output) {
if (!length) {
length = reader.view.getUint32(reader.offset, true)
// length = reader.view.getUint32(reader.offset, true)
reader.offset += 4
}
let seen = 0
while (seen < values.length) {
while (seen < output.length) {
const header = readVarInt(reader)
if (header & 1) {
// bit-packed
seen = readBitPacked(reader, header, width, values, seen)
seen = readBitPacked(reader, header, width, output, seen)
} else {
// rle
const count = header >>> 1
readRle(reader, count, width, values, seen)
readRle(reader, count, width, output, seen)
seen += count
}
}
// assert(reader.offset - startOffset === length)
}
/**
* Read a run-length encoded value.
* Run-length encoding: read value with bitWidth and repeat it count times.
*
* The count is determined from the header and the width is used to grab the
* value that's repeated. Yields the value repeated count times.
*
* @param {DataReader} reader - buffer to read data from
* @param {number} count - number of values to read
* @param {number} bitWidth - width of each bit-packed group
* @param {DecodedArray} values - output array
* @param {number} seen - number of values seen so far
* @param {DataReader} reader
* @param {number} count
* @param {number} bitWidth
* @param {DecodedArray} output
* @param {number} seen
*/
function readRle(reader, count, bitWidth, values, seen) {
function readRle(reader, count, bitWidth, output, seen) {
const width = bitWidth + 7 >> 3
let value = 0
if (width === 1) {
@@ -70,7 +68,7 @@ function readRle(reader, count, bitWidth, values, seen) {
// repeat value count times
for (let i = 0; i < count; i++) {
values[seen + i] = value
output[seen + i] = value
}
}
@@ -78,14 +76,14 @@ function readRle(reader, count, bitWidth, values, seen) {
* Read a bit-packed run of the rle/bitpack hybrid.
* Supports width > 8 (crossing bytes).
*
* @param {DataReader} reader - buffer to read data from
* @param {number} header - header information
* @param {number} bitWidth - width of each bit-packed group
* @param {DecodedArray} values - output array
* @param {number} seen - number of values seen so far
* @returns {number} number of values seen
* @param {DataReader} reader
* @param {number} header - bit-pack header
* @param {number} bitWidth
* @param {DecodedArray} output
* @param {number} seen
* @returns {number} total output values so far
*/
function readBitPacked(reader, header, bitWidth, values, seen) {
function readBitPacked(reader, header, bitWidth, output, seen) {
let count = header >> 1 << 3 // values to read
const mask = (1 << bitWidth) - 1
@@ -112,9 +110,9 @@ function readBitPacked(reader, header, bitWidth, values, seen) {
reader.offset++
left += 8
} else {
if (seen < values.length) {
if (seen < output.length) {
// emit value
values[seen++] = data >> right & mask
output[seen++] = data >> right & mask
}
count--
right += bitWidth

@@ -14,7 +14,7 @@ describe('readRleBitPackedHybrid', () => {
const reader = { view, offset: 0 }
const values = new Array(6)
readRleBitPackedHybrid(reader, 1, 6, values)
readRleBitPackedHybrid(reader, 1, 4, values)
expect(reader.offset).toBe(4)
expect(values).toEqual([1, 1, 1, 100, 100, 100])
})
@@ -42,7 +42,7 @@ describe('readRleBitPackedHybrid', () => {
const reader = { view, offset: 0 }
const values = new Array(3)
readRleBitPackedHybrid(reader, 32, 3, values)
readRleBitPackedHybrid(reader, 32, 5, values)
expect(reader.offset).toBe(5)
expect(values).toEqual([234000, 234000, 234000])
})
@@ -62,7 +62,7 @@ describe('readRleBitPackedHybrid', () => {
// Bit-packed values: false, false, true
const buffer = new ArrayBuffer(8)
const view = new DataView(buffer)
view.setInt32(0, 3, true) // length 3 little-endian
view.setInt32(0, 2, true) // length 2 little-endian
view.setUint8(4, 0b00000011) // Bit-packed header for 1-8 values
view.setUint8(5, 0b00000100) // Bit-packed values (false, false, true)
const reader = { view, offset: 0 }
@@ -83,7 +83,7 @@ describe('readRleBitPackedHybrid', () => {
const reader = { view, offset: 0 }
const values = new Array(9)
readRleBitPackedHybrid(reader, 1, 9, values)
readRleBitPackedHybrid(reader, 1, 3, values)
expect(reader.offset).toBe(3)
expect(values).toEqual([1, 1, 1, 1, 1, 1, 1, 1, 1])
})