mirror of
https://github.com/asadbek064/hyparquet-writer.git
synced 2025-12-28 07:56:39 +00:00
Type thrift
This commit is contained in:
parent
6727628aad
commit
cec00d9699
@ -40,7 +40,7 @@
|
||||
"test": "vitest run"
|
||||
},
|
||||
"dependencies": {
|
||||
"hyparquet": "1.10.3"
|
||||
"hyparquet": "1.10.4"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@babel/eslint-parser": "7.27.0",
|
||||
|
||||
@ -69,7 +69,7 @@ export function writeMetadata(writer, metadata) {
|
||||
})),
|
||||
field_5: rg.file_offset,
|
||||
field_6: rg.total_compressed_size,
|
||||
field_7: rg.ordinal,
|
||||
// field_7: rg.ordinal, // should be int16
|
||||
})),
|
||||
field_5: metadata.key_value_metadata && metadata.key_value_metadata.map(kv => ({
|
||||
field_1: kv.key,
|
||||
|
||||
119
src/thrift.js
119
src/thrift.js
@ -21,6 +21,7 @@ const CompactType = {
|
||||
*
|
||||
* Expects keys named like "field_1", "field_2", etc. in ascending order.
|
||||
*
|
||||
* @import {ThriftType} from 'hyparquet/src/types.js'
|
||||
* @import {Writer} from '../src/types.js'
|
||||
* @param {Writer} writer
|
||||
* @param {Record<string, any>} data
|
||||
@ -65,35 +66,15 @@ export function serializeTCompactProtocol(writer, data) {
|
||||
* @returns {number} CompactType
|
||||
*/
|
||||
function getCompactTypeForValue(value) {
|
||||
if (value === true) {
|
||||
return CompactType.TRUE
|
||||
}
|
||||
if (value === false) {
|
||||
return CompactType.FALSE
|
||||
}
|
||||
if (typeof value === 'number') {
|
||||
// We'll store integer as I32, otherwise DOUBLE
|
||||
return Number.isInteger(value) ? CompactType.I32 : CompactType.DOUBLE
|
||||
}
|
||||
if (typeof value === 'bigint') {
|
||||
return CompactType.I64
|
||||
}
|
||||
if (typeof value === 'string') {
|
||||
// Possibly treat 32-hex as a 16-byte UUID
|
||||
if (/^[0-9a-fA-F]{32}$/.test(value)) {
|
||||
return CompactType.UUID
|
||||
}
|
||||
return CompactType.BINARY
|
||||
}
|
||||
if (value instanceof Uint8Array) {
|
||||
return CompactType.BINARY
|
||||
}
|
||||
if (Array.isArray(value)) {
|
||||
return CompactType.LIST
|
||||
}
|
||||
if (value && typeof value === 'object') {
|
||||
return CompactType.STRUCT
|
||||
}
|
||||
if (value === true) return CompactType.TRUE
|
||||
if (value === false) return CompactType.FALSE
|
||||
if (Number.isInteger(value)) return CompactType.I32
|
||||
if (typeof value === 'number') return CompactType.DOUBLE
|
||||
if (typeof value === 'bigint') return CompactType.I64
|
||||
if (typeof value === 'string') return CompactType.BINARY
|
||||
if (value instanceof Uint8Array) return CompactType.BINARY
|
||||
if (Array.isArray(value)) return CompactType.LIST
|
||||
if (value && typeof value === 'object') return CompactType.STRUCT
|
||||
throw new Error(`Cannot determine thrift compact type for: ${value}`)
|
||||
}
|
||||
|
||||
@ -102,52 +83,36 @@ function getCompactTypeForValue(value) {
|
||||
*
|
||||
* @param {Writer} writer
|
||||
* @param {number} type
|
||||
* @param {any} value
|
||||
* @param {ThriftType} value
|
||||
*/
|
||||
function writeElement(writer, type, value) {
|
||||
switch (type) {
|
||||
case CompactType.TRUE:
|
||||
case CompactType.FALSE:
|
||||
return // true/false is stored in the type
|
||||
case CompactType.BYTE:
|
||||
// true/false is stored in the type
|
||||
if (type === CompactType.TRUE) return
|
||||
if (type === CompactType.FALSE) return
|
||||
if (type === CompactType.BYTE && typeof value === 'number') {
|
||||
writer.appendUint8(value)
|
||||
return
|
||||
case CompactType.I16:
|
||||
case CompactType.I32: {
|
||||
// ZigZag -> varint
|
||||
// For 32-bit int: zigzag = (n << 1) ^ (n >> 31)
|
||||
} else if (type === CompactType.I32 && typeof value === 'number') {
|
||||
const zigzag = value << 1 ^ value >> 31
|
||||
writer.appendVarInt(zigzag)
|
||||
return
|
||||
}
|
||||
case CompactType.I64: {
|
||||
} else if (type === CompactType.I64 && typeof value === 'bigint') {
|
||||
// For 64-bit (bigint) we do (value << 1n) ^ (value >> 63n) in zigzag
|
||||
const n = BigInt(value)
|
||||
const zigzag = n << 1n ^ n >> 63n
|
||||
const zigzag = value << 1n ^ value >> 63n
|
||||
writer.appendVarBigInt(zigzag)
|
||||
return
|
||||
}
|
||||
case CompactType.DOUBLE:
|
||||
} else if (type === CompactType.DOUBLE && typeof value === 'number') {
|
||||
writer.appendFloat64(value)
|
||||
return
|
||||
case CompactType.BINARY: {
|
||||
} else if (type === CompactType.BINARY && typeof value === 'string') {
|
||||
// store length as a varint, then raw bytes
|
||||
let bytes
|
||||
if (typeof value === 'string') {
|
||||
bytes = new TextEncoder().encode(value)
|
||||
} else {
|
||||
// e.g. Uint8Array
|
||||
bytes = value
|
||||
}
|
||||
const bytes = new TextEncoder().encode(value)
|
||||
writer.appendVarInt(bytes.length)
|
||||
writer.appendBuffer(bytes)
|
||||
return
|
||||
}
|
||||
case CompactType.LIST: {
|
||||
writer.appendBytes(bytes)
|
||||
} else if (type === CompactType.BINARY && value instanceof Uint8Array) {
|
||||
// store length as a varint, then raw bytes
|
||||
writer.appendVarInt(value.byteLength)
|
||||
writer.appendBytes(value)
|
||||
} else if (type === CompactType.LIST && Array.isArray(value)) {
|
||||
// Must store (size << 4) | elementType
|
||||
// We'll guess the element type from the first element
|
||||
const arr = value
|
||||
const size = arr.length
|
||||
const size = value.length
|
||||
if (size === 0) {
|
||||
// (0 << 4) | type for an empty list – pick BYTE arbitrarily
|
||||
writer.appendUint8(0 << 4 | CompactType.BYTE)
|
||||
@ -155,7 +120,7 @@ function writeElement(writer, type, value) {
|
||||
}
|
||||
|
||||
// TODO: Check for heterogeneous lists?
|
||||
const elemType = getCompactTypeForValue(arr[0])
|
||||
const elemType = getCompactTypeForValue(value[0])
|
||||
|
||||
const sizeNibble = size > 14 ? 15 : size
|
||||
writer.appendUint8(sizeNibble << 4 | elemType)
|
||||
@ -166,18 +131,16 @@ function writeElement(writer, type, value) {
|
||||
// Special trick for booleans in a list
|
||||
if (elemType === CompactType.TRUE || elemType === CompactType.FALSE) {
|
||||
// Write each boolean as a single 0 or 1 byte
|
||||
for (const v of arr) {
|
||||
for (const v of value) {
|
||||
writer.appendUint8(v ? 1 : 0)
|
||||
}
|
||||
} else {
|
||||
// Otherwise write them out normally
|
||||
for (const v of arr) {
|
||||
for (const v of value) {
|
||||
writeElement(writer, elemType, v)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
case CompactType.STRUCT: {
|
||||
} else if (type === CompactType.STRUCT && typeof value === 'object') {
|
||||
// Recursively write sub-fields as "field_N: val", end with STOP
|
||||
let lastFid = 0
|
||||
for (const [k, v] of Object.entries(value)) {
|
||||
@ -198,21 +161,7 @@ function writeElement(writer, type, value) {
|
||||
}
|
||||
// Write STOP
|
||||
writer.appendUint8(CompactType.STOP)
|
||||
return
|
||||
}
|
||||
case CompactType.UUID: {
|
||||
// Expect a 32-hex string. Write 16 bytes
|
||||
if (typeof value !== 'string' || value.length !== 32) {
|
||||
throw new Error(`Expected 32-hex string for UUID, got ${value}`)
|
||||
}
|
||||
for (let i = 0; i < 16; i++) {
|
||||
const byte = parseInt(value.slice(i * 2, i * 2 + 2), 16)
|
||||
writer.appendUint8(byte)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
default:
|
||||
throw new Error(`Unhandled type in writeElement: ${type}`)
|
||||
} else {
|
||||
throw new Error(`unhandled type in writeElement: ${type} for value ${value}`)
|
||||
}
|
||||
}
|
||||
|
||||
@ -24,8 +24,8 @@ describe('serializeTCompactProtocol', () => {
|
||||
field_5: 0x7fffffff, // I32
|
||||
field_6: BigInt('0x7fffffffffffffff'), // I64
|
||||
field_7: 123.456, // DOUBLE
|
||||
// BINARY (string as Uint8Array):
|
||||
field_8: new TextEncoder().encode('Hello, Thrift!'),
|
||||
field_8: 'Hello, Thrift!',
|
||||
field_9: new TextEncoder().encode('Hello, Thrift!'),
|
||||
}
|
||||
|
||||
const writer = new Writer()
|
||||
@ -41,8 +41,9 @@ describe('serializeTCompactProtocol', () => {
|
||||
expect(result.field_6).toBe(BigInt('0x7fffffffffffffff'))
|
||||
expect(result.field_7).toBeCloseTo(123.456)
|
||||
// Decode the binary back into a string
|
||||
const decodedString = new TextDecoder().decode(result.field_8)
|
||||
expect(decodedString).toBe('Hello, Thrift!')
|
||||
const decoder = new TextDecoder()
|
||||
expect(decoder.decode(result.field_8)).toBe('Hello, Thrift!')
|
||||
expect(decoder.decode(result.field_9)).toBe('Hello, Thrift!')
|
||||
})
|
||||
|
||||
it('serializes a nested STRUCT and LIST of booleans', () => {
|
||||
@ -69,20 +70,6 @@ describe('serializeTCompactProtocol', () => {
|
||||
expect(result.field_2).toEqual([true, false, true, false])
|
||||
})
|
||||
|
||||
it('serializes a UUID correctly', () => {
|
||||
// 32 hex chars => 16 bytes
|
||||
const uuidHex = '00112233445566778899aabbccddeeff'
|
||||
const data = { field_1: uuidHex }
|
||||
|
||||
const writer = new Writer()
|
||||
serializeTCompactProtocol(writer, data)
|
||||
const buf = writer.buffer.slice(0, writer.offset)
|
||||
const result = roundTripDeserialize(buf)
|
||||
|
||||
// Should come back as the same string
|
||||
expect(result.field_1).toBe(uuidHex)
|
||||
})
|
||||
|
||||
it('handles empty object (only STOP)', () => {
|
||||
const data = {}
|
||||
const writer = new Writer()
|
||||
|
||||
Loading…
Reference in New Issue
Block a user