hyparquet-writer/src/thrift.js

160 lines
5.4 KiB
JavaScript
Raw Normal View History

2025-04-15 06:40:27 +00:00
import { CompactType } from 'hyparquet/src/thrift.js'
2025-03-25 17:27:15 +00:00
/**
 * Serialize a JS object in TCompactProtocol format.
 *
 * Expects keys named like "field_1", "field_2", etc. in ascending order.
 * Entries whose value is undefined are skipped, and a STOP byte is appended
 * after the last field.
 *
 * @import {ThriftType} from 'hyparquet/src/types.js'
 * @import {Writer} from '../src/types.js'
 * @param {Writer} writer
 * @param {Record<string, any>} data
 * @throws {Error} if a key has no parseable field ID, if field IDs are not
 *   strictly increasing, or if a field ID does not fit in a signed 16-bit int
 */
export function serializeTCompactProtocol(writer, data) {
  let lastFid = 0

  // write each field
  for (const [key, value] of Object.entries(data)) {
    if (value === undefined) continue

    // we expect key = "field_N" so we can extract N as the field ID
    const fid = parseInt(key.replace(/^field_/, ''), 10)
    if (Number.isNaN(fid)) {
      throw new Error(`thrift invalid field name: ${key}. Expected "field_###".`)
    }

    // write the field-begin header
    const type = getCompactTypeForValue(value)
    const delta = fid - lastFid
    if (delta <= 0) {
      throw new Error(`thrift non-monotonic field ID: fid=${fid}, lastFid=${lastFid}`)
    }
    // The long-form header below zigzags the ID with a 16-bit shift, which is
    // only correct for IDs up to 0x7fff. Reject larger IDs instead of
    // emitting a header that decodes to a different (negative) field ID.
    if (fid > 0x7fff) {
      throw new Error(`thrift field ID out of i16 range: fid=${fid}`)
    }

    if (delta <= 15) {
      // short form: high nibble = delta, low nibble = type
      writer.appendUint8(delta << 4 | type)
    } else {
      // long form: type byte followed by the zigzag-encoded field ID
      writer.appendUint8(type)
      writer.appendVarInt(fid << 1 ^ fid >> 15) // zigzag
    }

    // Write the field content itself
    writeElement(writer, type, value)
    lastFid = fid
  }

  // Finally write STOP
  writer.appendUint8(CompactType.STOP)
}
/**
 * Deduce a TCompactProtocol type from the JS value.
 *
 * Mapping: true/false -> TRUE/FALSE, integer-valued number -> I32,
 * other number -> DOUBLE, bigint -> I64, string or Uint8Array -> BINARY,
 * array -> LIST, any other non-null object -> STRUCT.
 *
 * Integer-valued numbers are always classified as I32, so they must fit in
 * a signed 32-bit integer; wider integers have to be passed as bigint.
 *
 * @param {any} value
 * @returns {number} CompactType
 * @throws {Error} if the value is an integer-valued number outside the
 *   int32 range, or if no compact type can be determined (e.g. null)
 */
function getCompactTypeForValue(value) {
  if (value === true) return CompactType.TRUE
  if (value === false) return CompactType.FALSE
  if (Number.isInteger(value)) {
    // I32 values are zigzag-encoded with 32-bit bitwise operators, which
    // would silently wrap anything outside the int32 range.
    if (value < -0x80000000 || value > 0x7fffffff) {
      throw new Error(`thrift integer out of i32 range: ${value}. Use a bigint for i64 values.`)
    }
    return CompactType.I32
  }
  if (typeof value === 'number') return CompactType.DOUBLE
  if (typeof value === 'bigint') return CompactType.I64
  if (typeof value === 'string') return CompactType.BINARY
  if (value instanceof Uint8Array) return CompactType.BINARY
  if (Array.isArray(value)) return CompactType.LIST
  if (value && typeof value === 'object') return CompactType.STRUCT
  throw new Error(`Cannot determine thrift compact type for: ${value}`)
}
/**
 * Write a single value of a given compact type.
 *
 * TRUE/FALSE write nothing here because the boolean is carried by the type
 * nibble of the enclosing field header. Lists take their element type from
 * the first element; nested structs use the same "field_N" key convention
 * as the top-level serializer and are terminated with STOP.
 *
 * @param {Writer} writer
 * @param {number} type
 * @param {ThriftType} value
 * @throws {Error} if the value does not match the type, if an I32/I64 value
 *   is out of range, or if a nested struct has invalid or non-increasing
 *   field IDs
 */
function writeElement(writer, type, value) {
  // true/false is stored in the type
  if (type === CompactType.TRUE) return
  if (type === CompactType.FALSE) return

  if (type === CompactType.BYTE && typeof value === 'number') {
    writer.appendUint8(value)
    return
  }

  if (type === CompactType.I32 && typeof value === 'number') {
    // The zigzag below uses 32-bit bitwise operators, which would silently
    // truncate fractions and wrap large values (e.g. a list like [1, 2.5],
    // whose element type is taken from the first element only).
    if (!Number.isInteger(value) || value < -0x80000000 || value > 0x7fffffff) {
      throw new Error(`thrift i32 value out of range: ${value}`)
    }
    const zigzag = (value << 1) ^ (value >> 31)
    writer.appendVarInt(zigzag)
    return
  }

  if (type === CompactType.I64 && typeof value === 'bigint') {
    // 64-bit zigzag is only valid for values that fit in a signed 64-bit int
    if (BigInt.asIntN(64, value) !== value) {
      throw new Error(`thrift i64 value out of range: ${value}`)
    }
    const zigzag = (value << 1n) ^ (value >> 63n)
    writer.appendVarBigInt(zigzag)
    return
  }

  if (type === CompactType.DOUBLE && typeof value === 'number') {
    writer.appendFloat64(value)
    return
  }

  if (type === CompactType.BINARY && typeof value === 'string') {
    // store length as a varint, then raw bytes (UTF-8 encoded)
    const bytes = new TextEncoder().encode(value)
    writer.appendVarInt(bytes.length)
    writer.appendBytes(bytes)
    return
  }

  if (type === CompactType.BINARY && value instanceof Uint8Array) {
    // store length as a varint, then raw bytes
    writer.appendVarInt(value.byteLength)
    writer.appendBytes(value)
    return
  }

  if (type === CompactType.LIST && Array.isArray(value)) {
    // Must store (size << 4) | elementType
    const size = value.length
    if (size === 0) {
      // (0 << 4) | type for an empty list pick BYTE arbitrarily
      writer.appendUint8(0 << 4 | CompactType.BYTE)
      return
    }

    // The element type is guessed from the first element.
    // TODO: Check for heterogeneous lists?
    const elemType = getCompactTypeForValue(value[0])

    // Sizes 0..14 fit in the high nibble; 15 means "size follows as varint"
    const sizeNibble = size > 14 ? 15 : size
    writer.appendUint8(sizeNibble << 4 | elemType)
    if (size > 14) {
      writer.appendVarInt(size)
    }

    if (elemType === CompactType.TRUE || elemType === CompactType.FALSE) {
      // Booleans inside a list have no field header to carry the value,
      // so write each one as a single 0 or 1 byte
      for (const v of value) {
        writer.appendUint8(v ? 1 : 0)
      }
    } else {
      // Otherwise write them out normally
      for (const v of value) {
        writeElement(writer, elemType, v)
      }
    }
    return
  }

  if (type === CompactType.STRUCT && typeof value === 'object') {
    // Recursively write sub-fields as "field_N: val", end with STOP
    let lastFid = 0
    for (const [k, v] of Object.entries(value)) {
      if (v === undefined) continue
      const fid = parseInt(k.replace(/^field_/, ''), 10)
      if (Number.isNaN(fid)) {
        throw new Error(`Invalid sub-field name: ${k}. Expected "field_###"`)
      }
      const t = getCompactTypeForValue(v)
      const delta = fid - lastFid
      if (delta <= 0) {
        throw new Error(`Non-monotonic fid in struct: fid=${fid}, lastFid=${lastFid}`)
      }
      // The long-form header zigzags the ID with a 16-bit shift, which is
      // only correct for IDs up to 0x7fff
      if (fid > 0x7fff) {
        throw new Error(`Field ID out of i16 range in struct: fid=${fid}`)
      }
      if (delta <= 15) {
        // short form: high nibble = delta, low nibble = type
        writer.appendUint8(delta << 4 | t)
      } else {
        // long form: type byte followed by the zigzag-encoded field ID
        writer.appendUint8(t)
        writer.appendVarInt(fid << 1 ^ fid >> 15)
      }
      writeElement(writer, t, v)
      lastFid = fid
    }
    // Write STOP
    writer.appendUint8(CompactType.STOP)
    return
  }

  throw new Error(`unhandled type in writeElement: ${type} for value ${value}`)
}