2024-01-05 10:16:15 +00:00
|
|
|
import { readVarInt } from './thrift.js'
|
|
|
|
|
|
|
|
|
|
/**
 * Compute how many bits are needed to represent every integer from 0 up to
 * and including the given maximum.
 *
 * @param {number} value - largest value that must be representable
 * @returns {number} bit width of the value (0 when value is 0)
 */
export function widthFromMaxInt(value) {
  // value + 1 distinct integers (0..value) need ceil(log2(value + 1)) bits
  const distinct = value + 1
  const bits = Math.log2(distinct)
  return Math.ceil(bits)
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decode a run-length encoded / bit-packed hybrid section into `values`.
 *
 * When `length` is falsy (0 or omitted), the byte length is taken from a
 * little-endian int32 stored at the current reader offset, and the reader is
 * advanced past it. Decoding stops once `length` bytes have been consumed or
 * the output array has been filled, whichever happens first.
 *
 * @typedef {import("./types.d.ts").DataReader} DataReader
 * @param {DataReader} reader - buffer to read data from
 * @param {number} width - width of each bit-packed group
 * @param {number} length - length of the encoded data
 * @param {number[]} values - output array
 */
export function readRleBitPackedHybrid(reader, width, length, values) {
  if (!length) {
    // no explicit length given: it is stored as an int32 prefix
    length = reader.view.getInt32(reader.offset, true)
    reader.offset += 4
    if (length < 0) throw new Error(`parquet invalid rle/bitpack length ${length}`)
  }

  const begin = reader.offset
  let filled = 0
  // true while encoded bytes remain and the output still has room
  const hasMore = () => reader.offset - begin < length && filled < values.length

  while (hasMore()) {
    const [runHeader, afterHeader] = readVarInt(reader.view, reader.offset)
    reader.offset = afterHeader

    // lowest header bit selects the run type: 1 = bit-packed, 0 = rle
    if (runHeader & 1) {
      filled = readBitPacked(reader, runHeader, width, values, filled)
    } else {
      const repeat = runHeader >>> 1
      readRle(reader, repeat, width, values, filled)
      filled += repeat
    }
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read a run-length encoded run and write it into the output array.
 *
 * The repeated value is stored once, little-endian, in ceil(bitWidth / 8)
 * bytes. Byte widths 0 through 4 are supported: a width of 0 consumes no
 * bytes and yields the value 0, and 3-byte values (bitWidth 17..24) are read
 * byte by byte. The value is then written `count` times starting at
 * `values[seen]`; no bound check is made against `values.length`.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @param {number} bitWidth - width of each bit-packed group
 * @param {number[]} values - output array
 * @param {number} seen - number of values seen so far
 * @throws {Error} if the byte width derived from bitWidth is outside 0..4
 */
function readRle(reader, count, bitWidth, values, seen) {
  // round the bit width up to whole bytes
  const width = (bitWidth + 7) >> 3
  if (width < 0 || width > 4) {
    throw new Error(`parquet invalid rle width ${width}`)
  }

  // assemble the little-endian value one byte at a time
  let value = 0
  for (let i = 0; i < width; i++) {
    value |= reader.view.getUint8(reader.offset + i) << (i << 3)
  }
  // bitwise ops produce a signed int32; keep 4-byte values unsigned
  value >>>= 0
  // advance only after all bytes were read successfully
  reader.offset += width

  // repeat value count times
  for (let i = 0; i < count; i++) {
    values[seen + i] = value
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read a bit-packed run of the rle/bitpack hybrid.
 * Supports width > 8 (crossing bytes).
 *
 * Values are extracted from a sliding accumulator that is refilled one byte
 * at a time. The run always consumes all of its encoded values from the
 * reader, but values are only stored while `seen < values.length`.
 *
 * NOTE: the accumulator is manipulated with 32-bit bitwise operators, so bit
 * widths above 24 may need more buffered bits than fit and are not reliable.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} header - header information
 * @param {number} bitWidth - width of each bit-packed group
 * @param {number[]} values - output array
 * @param {number} seen - number of values seen so far
 * @returns {number} number of values seen
 * @throws {Error} if the run starts past the end of the buffer and bitWidth is non-zero
 */
function readBitPacked(reader, header, bitWidth, values, seen) {
  // extract number of values to read from header (groups of 8 values)
  let count = (header >> 1) << 3
  // mask for bitWidth number of bits
  const mask = (1 << bitWidth) - 1

  // Sometimes it tries to read outside of available memory, but it will be masked out anyway
  let data = 0
  if (reader.offset < reader.view.byteLength) {
    data = reader.view.getUint8(reader.offset)
    reader.offset++
  } else if (mask) {
    throw new Error(`parquet bitpack offset ${reader.offset} out of range`)
  }
  // left: number of bits buffered in data; right: bit position of the next value
  let left = 8
  let right = 0

  // read values
  while (count) {
    if (right > 8) {
      // a whole consumed byte sits at the bottom of data: drop it.
      // Must be an unsigned shift: a byte >= 0x80 OR-ed in at bit 24 makes
      // data negative, and a signed shift would smear 1s into the high bits.
      right -= 8
      left -= 8
      data >>>= 8
    } else if (left - right < bitWidth) {
      // if we don't have bitWidth number of bits to read, read next byte
      data |= reader.view.getUint8(reader.offset) << left
      reader.offset++
      left += 8
    } else {
      if (seen < values.length) {
        // emit value by shifting off to the right and masking
        values[seen++] = (data >> right) & mask
      }
      count--
      right += bitWidth
    }
  }

  return seen
}
|