2024-01-05 10:16:15 +00:00
|
|
|
import { readVarInt } from './thrift.js'
|
|
|
|
|
|
|
|
|
|
/**
 * Compute the number of bits needed to represent every integer from zero up
 * to and including the given maximum.
 *
 * @param {number} value - largest value that must be representable
 * @returns {number} bit width required for the value (0 when value is 0)
 */
export function widthFromMaxInt(value) {
  // value + 1 distinct integers (0..value) need ceil(log2(value + 1)) bits
  const distinctValues = value + 1
  const exactBits = Math.log2(distinctValues)
  return Math.ceil(exactBits)
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decode a run-length encoded / bit-packed hybrid block into an output array.
 *
 * When length is zero (or otherwise falsy), a little-endian uint32 length
 * prefix is read from the reader and skipped before decoding starts.
 * Runs are decoded one after another until `values.length` entries have been
 * accounted for.
 *
 * @typedef {import("./types.d.ts").DataReader} DataReader
 * @typedef {number[]} DecodedArray
 * @param {DataReader} reader - buffer to read data from
 * @param {number} width - width of each bit-packed group
 * @param {number} length - length of the encoded data
 * @param {DecodedArray} values - output array
 */
export function readRleBitPackedHybrid(reader, width, length, values) {
  if (!length) {
    // no length supplied: consume the 4-byte prefix stored ahead of the runs
    length = reader.view.getUint32(reader.offset, true)
    reader.offset += 4
  }

  for (let filled = 0; filled < values.length;) {
    const header = readVarInt(reader)
    // lowest header bit selects the run type: 1 = bit-packed, 0 = rle
    const isRle = !(header & 1)
    if (isRle) {
      const runLength = header >>> 1
      readRle(reader, runLength, width, values, filled)
      filled += runLength
      continue
    }
    filled = readBitPacked(reader, header, width, values, filled)
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read one run-length encoded run and write it into the output array.
 *
 * The repeated value is stored little-endian in ceil(bitWidth / 8) bytes
 * (0 to 4 bytes are supported; a width of zero reads nothing and repeats 0).
 * The value is written `count` times starting at index `seen`, but never past
 * the end of `values`, so an oversized run cannot grow the output array.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} count - number of values to read
 * @param {number} bitWidth - width of each bit-packed group
 * @param {DecodedArray} values - output array
 * @param {number} seen - number of values seen so far
 * @throws {Error} if the repeated value would occupy more than 4 bytes
 */
function readRle(reader, count, bitWidth, values, seen) {
  // number of whole bytes holding the repeated value
  const width = bitWidth + 7 >> 3
  const { view, offset } = reader
  let value = 0
  if (width === 1) {
    value = view.getUint8(offset)
  } else if (width === 2) {
    value = view.getUint16(offset, true)
  } else if (width === 3) {
    // DataView has no 24-bit accessor: combine the low 16 bits with the high byte
    value = view.getUint16(offset, true) | view.getUint8(offset + 2) << 16
  } else if (width === 4) {
    value = view.getUint32(offset, true)
  } else if (width) {
    throw new Error(`parquet invalid rle width ${width}`)
  }
  reader.offset += width

  // repeat value count times, clamped to the space left in the output
  const end = Math.min(seen + count, values.length)
  for (let i = seen; i < end; i++) {
    values[i] = value
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Read a bit-packed run of the rle/bitpack hybrid.
 *
 * Values are packed least-significant-bit first and may cross byte
 * boundaries (width > 8 is supported). Every value in the run is consumed
 * from the reader, but only those that fit below `values.length` are stored.
 *
 * Bits are buffered in a 32-bit integer accumulator.
 * NOTE(review): widths above 24 bits can need more than 32 buffered bits and
 * are not handled correctly by this accumulator — confirm callers never
 * request them.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} header - header information
 * @param {number} bitWidth - width of each bit-packed group
 * @param {number[]} values - output array
 * @param {number} seen - number of values seen so far
 * @returns {number} number of values seen
 * @throws {Error} if the run starts past the end of the buffer and bitWidth is non-zero
 */
function readBitPacked(reader, header, bitWidth, values, seen) {
  // extract number of values to read from header:
  // the bits above the flag bit count groups of 8 values
  let count = header >> 1 << 3
  // mask for bitWidth number of bits
  const mask = (1 << bitWidth) - 1

  let data = 0
  if (reader.offset < reader.view.byteLength) {
    data = reader.view.getUint8(reader.offset++)
  } else if (mask) {
    // sometimes out-of-bounds reads are masked out
    throw new Error(`parquet bitpack offset ${reader.offset} out of range`)
  }
  let left = 8 // number of bits loaded into data so far
  let right = 0 // number of loaded bits already emitted

  // read values
  while (count) {
    // if we have crossed a byte boundary, shift the data
    if (right > 8) {
      right -= 8
      left -= 8
      // Unsigned shift: a byte loaded into bits 24..31 can set the sign bit,
      // and a sign-propagating shift would smear 1-bits into the positions
      // where the following bytes get OR-ed in, corrupting later values.
      data >>>= 8
    } else if (left - right < bitWidth) {
      // if we don't have bitWidth number of bits to read, read next byte
      data |= reader.view.getUint8(reader.offset) << left
      reader.offset++
      left += 8
    } else {
      if (seen < values.length) {
        // emit value by shifting off to the right and masking
        values[seen++] = data >> right & mask
      }
      count--
      right += bitWidth
    }
  }

  return seen
}
|
2024-05-21 06:09:31 +00:00
|
|
|
|
|
|
|
|
/**
 * Reassemble floating point values whose bytes were split into separate
 * streams.
 *
 * The input holds one stream per byte position (4 streams for Float32Array,
 * otherwise 8), each nValues bytes long: stream b carries byte b of every
 * value. The bytes are written straight into the memory backing `output`,
 * and the reader is advanced past everything that was consumed.
 *
 * @param {DataReader} reader - buffer to read data from
 * @param {number} nValues - number of values to decode
 * @param {Float32Array | Float64Array} output - typed array receiving the values
 */
export function byteStreamSplit(reader, nValues, output) {
  const byteWidth = output instanceof Float32Array ? 4 : 8
  // View exactly the bytes backing output: it may be a window into a larger
  // buffer, in which case a view over the whole buffer would write elsewhere.
  const bytes = new Uint8Array(output.buffer, output.byteOffset, output.byteLength)
  for (let b = 0; b < byteWidth; b++) {
    for (let i = 0; i < nValues; i++) {
      bytes[i * byteWidth + b] = reader.view.getUint8(reader.offset++)
    }
  }
}
|