// hyparquet/src/thrift.js (JavaScript, 274 lines, 6.9 KiB)

/**
 * TCompactProtocol type identifiers.
 *
 * Every identifier is small enough to be stored in the low four bits
 * of a header byte.
 */
const CompactType = {
  STOP: 0,
  TRUE: 1,
  FALSE: 2,
  BYTE: 3,
  I16: 4,
  I32: 5,
  I64: 6,
  DOUBLE: 7,
  BINARY: 8,
  LIST: 9,
  SET: 10,
  MAP: 11,
  STRUCT: 12,
  UUID: 13,
}
/**
 * Parse a TCompactProtocol-encoded struct from the start of a buffer.
 *
 * Fields are read one after another until a STOP marker is found or the end
 * of the buffer is reached. Each decoded field is stored in the result under
 * the key `field_<id>`, where `<id>` is the field id read from the stream.
 *
 * @typedef {import("./types.js").Decoded<T>} Decoded
 * @template T
 * @param {ArrayBuffer} arrayBuffer buffer to decode, starting at byte 0
 * @returns {Decoded<Record<string, any>>} decoded fields and number of bytes consumed
 */
export function deserializeTCompactProtocol(arrayBuffer) {
  const view = new DataView(arrayBuffer)
  let byteLength = 0
  let lastFid = 0
  /** @type {Record<string, any>} */
  const value = {}

  while (byteLength < arrayBuffer.byteLength) {
    // Field header: type, field id, and the offset just past the header
    const [type, fid, fieldIndex, newLastFid] = readFieldBegin(view, byteLength, lastFid)
    byteLength = fieldIndex
    lastFid = newLastFid
    if (type === CompactType.STOP) {
      break
    }
    // Field payload, decoded according to the type found in the header
    const [fieldValue, nextIndex] = readElement(view, type, byteLength)
    value[`field_${fid}`] = fieldValue
    byteLength = nextIndex
  }

  return { value, byteLength }
}
/**
 * Read a single element based on its type.
 *
 * - TRUE / FALSE consume no bytes (the value is carried by the type itself),
 *   except inside a LIST where each boolean occupies one byte.
 * - I16 / I32 are zigzag varints returned as numbers; I64 is returned as a bigint.
 * - DOUBLE is read as a little-endian 64-bit float.
 * - BINARY is a varint length followed by that many bytes, decoded as UTF-8.
 * - STRUCT is decoded into an object keyed by `field_<id>`.
 * - UUID is returned as 32 lowercase hex characters without dashes.
 *
 * MAP and SET are not handled yet and fall through to the error below.
 *
 * @param {DataView} view
 * @param {number} type CompactType identifier of the element
 * @param {number} index offset into the view where the element starts
 * @returns {[any, number]} [value, newIndex]
 * @throws {Error} if the type is not handled
 */
function readElement(view, type, index) {
  switch (type) {
  case CompactType.TRUE:
    return [true, index]
  case CompactType.FALSE:
    return [false, index]
  case CompactType.BYTE:
    // read byte directly
    return [view.getInt8(index), index + 1]
  case CompactType.I16:
  case CompactType.I32:
    return readZigZag(view, index)
  case CompactType.I64:
    return readZigZagBigInt(view, index)
  case CompactType.DOUBLE:
    return [view.getFloat64(index, true), index + 8]
  case CompactType.BINARY: {
    // strings are encoded as utf-8, no \0 delimiter
    const [stringLength, stringIndex] = readVarInt(view, index)
    // Offsets are relative to the view, which may itself start part-way into its buffer
    const strBytes = new Uint8Array(view.buffer, view.byteOffset + stringIndex, stringLength)
    return [new TextDecoder().decode(strBytes), stringIndex + stringLength]
  }
  case CompactType.LIST: {
    const [elemType, listSize, listIndex] = readCollectionBegin(view, index)
    // Inside a list every boolean takes one byte (1 means true); only struct
    // field headers carry the boolean value inline in the type nibble.
    const isBoolList = elemType === CompactType.TRUE || elemType === CompactType.FALSE
    const listValues = []
    let offset = listIndex
    for (let i = 0; i < listSize; i++) {
      if (isBoolList) {
        listValues.push(view.getUint8(offset++) === CompactType.TRUE)
      } else {
        const [listElem, nextOffset] = readElement(view, elemType, offset)
        listValues.push(listElem)
        offset = nextOffset
      }
    }
    return [listValues, offset]
  }
  case CompactType.STRUCT: {
    /** @type {Record<string, any>} */
    const structValues = {}
    let offset = index
    // Field id deltas restart from zero for every nested struct
    let structLastFid = 0
    while (true) {
      const [fieldType, fid, fieldIndex, newLastFid] = readFieldBegin(view, offset, structLastFid)
      offset = fieldIndex
      structLastFid = newLastFid
      if (fieldType === CompactType.STOP) {
        break
      }
      const [fieldValue, nextOffset] = readElement(view, fieldType, offset)
      structValues[`field_${fid}`] = fieldValue
      offset = nextOffset
    }
    return [structValues, offset]
  }
  // TODO: MAP and SET
  case CompactType.UUID: {
    // Read 16 bytes to uuid string
    let uuid = ''
    let offset = index
    for (let i = 0; i < 16; i++) {
      uuid += view.getUint8(offset++).toString(16).padStart(2, '0')
    }
    return [uuid, offset]
  }
  default:
    throw new Error(`Unhandled type: ${type}`)
  }
}
/**
 * Var int, also known as Unsigned LEB128.
 * Var ints take 1 to 5 bytes (int32) or 1 to 10 bytes (int64).
 * Takes a Big Endian unsigned integer, left-pads the bit-string to make it a
 * multiple of 7 bits, splits it into 7-bit groups, prefix the most-significant
 * 7-bit group with the 0 bit, prefixing the remaining 7-bit groups with the
 * 1 bit and encode the resulting bit-string as Little Endian.
 *
 * The value is accumulated with 32-bit bitwise operators, so this reader is
 * only suitable for values that fit in 32 bits; a value with bit 31 set is
 * returned as a negative number.
 *
 * @param {DataView} view
 * @param {number} index offset of the first varint byte
 * @returns {[number, number]} [value, newIndex]
 */
function readVarInt(view, index) {
  let result = 0
  let shift = 0
  let offset = index
  while (true) {
    const byte = view.getUint8(offset++)
    // low 7 bits are payload, least-significant group first
    result |= (byte & 0x7f) << shift
    // a clear high bit marks the final byte
    if ((byte & 0x80) === 0) {
      return [result, offset]
    }
    shift += 7
  }
}
/**
 * Read a varint as a bigint.
 *
 * Same byte layout as a regular varint (7 payload bits per byte,
 * least-significant group first, high bit set on every byte except the last),
 * but accumulated as a bigint so the value is not limited to 32 bits.
 *
 * @param {DataView} view
 * @param {number} index offset of the first varint byte
 * @returns {[bigint, number]} [value, newIndex]
 */
function readVarBigInt(view, index) {
  const payloadMask = BigInt(0x7f)
  const groupBits = BigInt(7)
  let result = BigInt(0)
  let shift = BigInt(0)
  let offset = index
  while (true) {
    const byte = view.getUint8(offset++)
    result |= (BigInt(byte) & payloadMask) << shift
    // a clear high bit marks the final byte
    if ((byte & 0x80) === 0) {
      return [result, offset]
    }
    shift += groupBits
  }
}
/**
 * Values of type int32 and int64 are transformed to a zigzag int.
 * A zigzag int folds positive and negative numbers into the positive number space.
 *
 * Reads a varint and undoes the zigzag transform: even encoded values map to
 * non-negative results, odd encoded values map to negative results.
 *
 * @param {DataView} view
 * @param {number} index offset of the first varint byte
 * @returns {[number, number]} [value, newIndex]
 */
function readZigZag(view, index) {
  const [zigzag, newIndex] = readVarInt(view, index)
  // The unsigned shift keeps the magnitude correct even when bit 31 of the varint is set
  const magnitude = zigzag >>> 1
  // -(lsb) is 0 for even inputs and all ones for odd inputs
  const signMask = -(zigzag & 1)
  return [magnitude ^ signMask, newIndex]
}
/**
 * A zigzag int folds positive and negative numbers into the positive number space.
 * This version returns a BigInt.
 *
 * Reads a varint as a bigint and undoes the zigzag transform.
 *
 * @param {DataView} view
 * @param {number} index offset of the first varint byte
 * @returns {[bigint, number]} [value, newIndex]
 */
function readZigZagBigInt(view, index) {
  const one = BigInt(1)
  const [zigzag, newIndex] = readVarBigInt(view, index)
  // -(lsb) is 0 for even inputs and -1 (all ones) for odd inputs
  const signMask = -(zigzag & one)
  return [(zigzag >> one) ^ signMask, newIndex]
}
/**
 * Get thrift type from half a byte.
 *
 * Keeps only the low four bits of the given header byte.
 *
 * @param {number} byte header byte
 * @returns {number} value of the low nibble (0-15)
 */
function getCompactType(byte) {
  return byte & 0x0f
}
/**
 * Read field type and field id.
 *
 * The header byte holds the field type in its low four bits and a field id
 * delta in its high four bits. A non-zero delta is added to the previous
 * field id; a zero delta means the field id follows as a zigzag varint.
 * A STOP type is reported as `[0, 0, newIndex, lastFid]`, leaving the last
 * field id unchanged.
 *
 * @param {DataView} view
 * @param {number} index offset of the field header byte
 * @param {number} lastFid id of the previously read field
 * @returns {[number, number, number, number]} [type, fid, newIndex, newLastFid]
 */
function readFieldBegin(view, index, lastFid) {
  const header = view.getUint8(index)
  const afterHeader = index + 1
  const type = getCompactType(header)
  if (type === CompactType.STOP) {
    // STOP also ends a struct
    return [0, 0, afterHeader, lastFid]
  }
  const delta = header >> 4
  if (delta !== 0) {
    // add delta to last field id
    const fid = lastFid + delta
    return [type, fid, afterHeader, fid]
  }
  // not a delta, read zigzag varint field id
  const [fid, newIndex] = readZigZag(view, afterHeader)
  return [type, fid, newIndex, fid]
}
/**
 * Read collection type and size.
 *
 * The header byte holds the element type in its low four bits and the size
 * in its high four bits. A size nibble of 15 means the actual size follows
 * as a varint.
 *
 * @param {DataView} view
 * @param {number} index offset of the collection header byte
 * @returns {[number, number, number]} [type, size, newIndex]
 */
function readCollectionBegin(view, index) {
  const header = view.getUint8(index)
  const afterHeader = index + 1
  const type = getCompactType(header)
  const shortSize = header >> 4
  if (shortSize !== 15) {
    return [type, shortSize, afterHeader]
  }
  // size did not fit in the nibble, it is stored as a varint after the header
  const [size, newIndex] = readVarInt(view, afterHeader)
  return [type, size, newIndex]
}
/**
 * Convert int to varint. Outputs 1-5 bytes for int32.
 *
 * Emits 7 bits per byte, least-significant group first, with the high bit
 * set on every byte except the last. The input is handled with 32-bit
 * bitwise operators, so a negative int32 is encoded as its unsigned 32-bit
 * bit pattern (5 bytes).
 *
 * @param {number} n
 * @returns {number[]}
 */
export function toVarInt(n) {
  const varInt = []
  let rest = n
  // keep emitting continuation bytes while anything above the low 7 bits is set
  while ((rest & ~0x7f) !== 0) {
    varInt.push((rest & 0x7f) | 0x80)
    rest >>>= 7
  }
  varInt.push(rest)
  return varInt
}