hyparquet/src/encoding.js

128 lines
3.7 KiB
JavaScript
Raw Normal View History

2024-01-05 10:16:15 +00:00
import { readVarInt } from './thrift.js'
/**
 * Compute how many bits are required to represent every integer from zero
 * up to (and including) the given maximum.
 *
 * @param {number} value - largest integer that must be representable
 * @returns {number} number of bits needed; 0 when value is 0
 */
export function widthFromMaxInt(value) {
  // value + 1 distinct integers (0..value) need ceil(log2(value + 1)) bits
  const distinct = value + 1
  const exactBits = Math.log2(distinct)
  return Math.ceil(exactBits)
}
/**
 * Decode a run-length encoded / bit-packed hybrid section into `values`.
 *
 * When `length` is falsy, a 4-byte little-endian length prefix is consumed
 * from the reader before decoding starts. Runs are then decoded one after
 * another until `values.length` entries have been produced; the low bit of
 * each run header selects bit-packed (1) or rle (0).
 *
 * @typedef {import("./types.d.ts").DataReader} DataReader
 * @typedef {number[]} DecodedArray
 * @param {DataReader} reader - buffer to read data from
 * @param {number} width - width of each bit-packed group
 * @param {number} length - length of the encoded data
 * @param {DecodedArray} values - output array
 */
export function readRleBitPackedHybrid(reader, width, length, values) {
  if (!length) {
    // length prefix: uint32, little-endian, stored ahead of the runs
    length = reader.view.getUint32(reader.offset, true)
    reader.offset += 4
  }

  let filled = 0
  while (filled < values.length) {
    const header = readVarInt(reader)
    const isBitPacked = (header & 1) === 1
    if (!isBitPacked) {
      // rle run: remaining header bits hold the repeat count
      const runLength = header >>> 1
      readRle(reader, runLength, width, values, filled)
      filled += runLength
      continue
    }
    // bit-packed run: the helper reports the new fill position
    filled = readBitPacked(reader, header, width, values, filled)
  }
}
/**
 * Read a run-length encoded run of the rle/bit-packed hybrid.
 *
 * The repeated value is stored little-endian in the minimum number of whole
 * bytes that can hold `bitWidth` bits (0 to 4 bytes). It is read once,
 * the reader offset is advanced past it, and the value is written `count`
 * times into `values` starting at index `seen`.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @param {number} bitWidth - width of each bit-packed group
 * @param {DecodedArray} values - output array
 * @param {number} seen - number of values seen so far
 * @throws {Error} if the value would occupy more than 4 bytes
 */
function readRle(reader, count, bitWidth, values, seen) {
  // round the bit width up to whole bytes
  const width = bitWidth + 7 >> 3
  if (width < 0 || width > 4) {
    throw new Error(`parquet invalid rle width ${width}`)
  }

  // assemble the little-endian value byte by byte so that every width from
  // 0 to 4 bytes (including 3) is handled; `>>> 0` keeps 4-byte values unsigned
  let value = 0
  for (let i = 0; i < width; i++) {
    value = (value | reader.view.getUint8(reader.offset + i) << (i << 3)) >>> 0
  }
  // only advance once the whole value has been read successfully
  reader.offset += width

  // repeat value count times
  for (let i = 0; i < count; i++) {
    values[seen + i] = value
  }
}
/**
 * Read a bit-packed run of the rle/bitpack hybrid.
 * Supports width > 8 (crossing bytes).
 *
 * The header (with its low flag bit dropped) gives the number of groups of
 * eight values in the run. Values are packed least-significant-bit first.
 * Every value in the run is consumed from the reader, but only those that
 * fit within `values.length` are stored.
 *
 * NOTE(review): the accumulator and mask are 32-bit integers, so bit widths
 * approaching 32 are likely not decoded reliably; confirm before relying
 * on widths above 24.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} header - header information
 * @param {number} bitWidth - width of each bit-packed group
 * @param {number[]} values - output array
 * @param {number} seen - number of values seen so far
 * @returns {number} number of values seen
 * @throws {Error} if the run starts past the end of the buffer and bitWidth is non-zero
 */
function readBitPacked(reader, header, bitWidth, values, seen) {
  // extract number of values to read from header (groups of 8)
  let count = header >> 1 << 3
  // mask for bitWidth number of bits
  const mask = (1 << bitWidth) - 1

  let data = 0
  if (reader.offset < reader.view.byteLength) {
    data = reader.view.getUint8(reader.offset++)
  } else if (mask) {
    // sometimes out-of-bounds reads are masked out
    throw new Error(`parquet bitpack offset ${reader.offset} out of range`)
  }
  // `left` = number of bits loaded into data, `right` = bits already consumed
  let left = 8
  let right = 0

  // read values
  while (count) {
    if (right > 8) {
      // if we have crossed a byte boundary, shift the data.
      // Unsigned shift: a byte with its high bit set may occupy bit 31, and a
      // sign-propagating shift would smear ones into the bits read next.
      right -= 8
      left -= 8
      data >>>= 8
    } else if (left - right < bitWidth) {
      // if we don't have bitWidth number of bits to read, read next byte
      data |= reader.view.getUint8(reader.offset) << left
      reader.offset++
      left += 8
    } else {
      if (seen < values.length) {
        // emit value by shifting off to the right and masking
        values[seen++] = data >> right & mask
      }
      count--
      right += bitWidth
    }
  }

  return seen
}