diff --git a/package.json b/package.json index 1d8744c..d942d71 100644 --- a/package.json +++ b/package.json @@ -40,7 +40,7 @@ "test": "vitest run" }, "dependencies": { - "hyparquet": "1.10.3" + "hyparquet": "1.10.4" }, "devDependencies": { "@babel/eslint-parser": "7.27.0", diff --git a/src/metadata.js b/src/metadata.js index 742c176..4ba9025 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -69,7 +69,7 @@ export function writeMetadata(writer, metadata) { })), field_5: rg.file_offset, field_6: rg.total_compressed_size, - field_7: rg.ordinal, + // field_7: rg.ordinal, // should be int16 })), field_5: metadata.key_value_metadata && metadata.key_value_metadata.map(kv => ({ field_1: kv.key, diff --git a/src/thrift.js b/src/thrift.js index ab626e9..6fc8146 100644 --- a/src/thrift.js +++ b/src/thrift.js @@ -21,6 +21,7 @@ const CompactType = { * * Expects keys named like "field_1", "field_2", etc. in ascending order. * + * @import {ThriftType} from 'hyparquet/src/types.js' * @import {Writer} from '../src/types.js' * @param {Writer} writer * @param {Record} data @@ -65,35 +66,15 @@ export function serializeTCompactProtocol(writer, data) { * @returns {number} CompactType */ function getCompactTypeForValue(value) { - if (value === true) { - return CompactType.TRUE - } - if (value === false) { - return CompactType.FALSE - } - if (typeof value === 'number') { - // We'll store integer as I32, otherwise DOUBLE - return Number.isInteger(value) ? CompactType.I32 : CompactType.DOUBLE - } - if (typeof value === 'bigint') { - return CompactType.I64 - } - if (typeof value === 'string') { - // Possibly treat 32-hex as a 16-byte UUID - if (/^[0-9a-fA-F]{32}$/.test(value)) { - return CompactType.UUID - } - return CompactType.BINARY - } - if (value instanceof Uint8Array) { - return CompactType.BINARY - } - if (Array.isArray(value)) { - return CompactType.LIST - } - if (value && typeof value === 'object') { - return CompactType.STRUCT - } + if (value === true) return CompactType.TRUE + if (value === false) return CompactType.FALSE + if (Number.isInteger(value)) return CompactType.I32 + if (typeof value === 'number') return CompactType.DOUBLE + if (typeof value === 'bigint') return CompactType.I64 + if (typeof value === 'string') return CompactType.BINARY + if (value instanceof Uint8Array) return CompactType.BINARY + if (Array.isArray(value)) return CompactType.LIST + if (value && typeof value === 'object') return CompactType.STRUCT throw new Error(`Cannot determine thrift compact type for: ${value}`) } @@ -102,52 +83,36 @@ function getCompactTypeForValue(value) { * * @param {Writer} writer * @param {number} type - * @param {any} value + * @param {ThriftType} value */ function writeElement(writer, type, value) { - switch (type) { - case CompactType.TRUE: - case CompactType.FALSE: - return // true/false is stored in the type - case CompactType.BYTE: + // true/false is stored in the type + if (type === CompactType.TRUE) return + if (type === CompactType.FALSE) return + if (type === CompactType.BYTE && typeof value === 'number') { writer.appendUint8(value) - return - case CompactType.I16: - case CompactType.I32: { - // ZigZag -> varint - // For 32-bit int: zigzag = (n << 1) ^ (n >> 31) + } else if (type === CompactType.I32 && typeof value === 'number') { const zigzag = value << 1 ^ value >> 31 writer.appendVarInt(zigzag) - return - } - case CompactType.I64: { + } else if (type === CompactType.I64 && typeof value === 'bigint') { // For 64-bit (bigint) we do (value << 1n) ^ (value >> 63n) in zigzag - const n = BigInt(value) - const zigzag = n << 1n ^ n >> 63n + const zigzag = value << 1n ^ value >> 63n writer.appendVarBigInt(zigzag) - return - } - case CompactType.DOUBLE: + } else if (type === CompactType.DOUBLE && typeof value === 'number') { writer.appendFloat64(value) - return - case CompactType.BINARY: { + } else if (type === CompactType.BINARY && typeof value === 'string') { // store length as a varint, then raw bytes - let bytes - if (typeof value === 'string') { - bytes = new TextEncoder().encode(value) - } else { - // e.g. Uint8Array - bytes = value - } + const bytes = new TextEncoder().encode(value) writer.appendVarInt(bytes.length) - writer.appendBuffer(bytes) - return - } - case CompactType.LIST: { + writer.appendBytes(bytes) + } else if (type === CompactType.BINARY && value instanceof Uint8Array) { + // store length as a varint, then raw bytes + writer.appendVarInt(value.byteLength) + writer.appendBytes(value) + } else if (type === CompactType.LIST && Array.isArray(value)) { // Must store (size << 4) | elementType // We'll guess the element type from the first element - const arr = value - const size = arr.length + const size = value.length if (size === 0) { // (0 << 4) | type for an empty list – pick BYTE arbitrarily writer.appendUint8(0 << 4 | CompactType.BYTE) @@ -155,7 +120,7 @@ function writeElement(writer, type, value) { } // TODO: Check for heterogeneous lists? - const elemType = getCompactTypeForValue(arr[0]) + const elemType = getCompactTypeForValue(value[0]) const sizeNibble = size > 14 ? 15 : size writer.appendUint8(sizeNibble << 4 | elemType) @@ -166,18 +131,16 @@ function writeElement(writer, type, value) { // Special trick for booleans in a list if (elemType === CompactType.TRUE || elemType === CompactType.FALSE) { // Write each boolean as a single 0 or 1 byte - for (const v of arr) { + for (const v of value) { writer.appendUint8(v ? 1 : 0) } } else { // Otherwise write them out normally - for (const v of arr) { + for (const v of value) { writeElement(writer, elemType, v) } } - return - } - case CompactType.STRUCT: { + } else if (type === CompactType.STRUCT && typeof value === 'object') { // Recursively write sub-fields as "field_N: val", end with STOP let lastFid = 0 for (const [k, v] of Object.entries(value)) { @@ -198,21 +161,7 @@ function writeElement(writer, type, value) { } // Write STOP writer.appendUint8(CompactType.STOP) - return - } - case CompactType.UUID: { - // Expect a 32-hex string. Write 16 bytes - if (typeof value !== 'string' || value.length !== 32) { - throw new Error(`Expected 32-hex string for UUID, got ${value}`) - } - for (let i = 0; i < 16; i++) { - const byte = parseInt(value.slice(i * 2, i * 2 + 2), 16) - writer.appendUint8(byte) - } - return - } - - default: - throw new Error(`Unhandled type in writeElement: ${type}`) + } else { + throw new Error(`unhandled type in writeElement: ${type} for value ${value}`) } } diff --git a/test/thrift.test.js b/test/thrift.test.js index 049429c..6e07a19 100644 --- a/test/thrift.test.js +++ b/test/thrift.test.js @@ -24,8 +24,8 @@ describe('serializeTCompactProtocol', () => { field_5: 0x7fffffff, // I32 field_6: BigInt('0x7fffffffffffffff'), // I64 field_7: 123.456, // DOUBLE - // BINARY (string as Uint8Array): - field_8: new TextEncoder().encode('Hello, Thrift!'), + field_8: 'Hello, Thrift!', + field_9: new TextEncoder().encode('Hello, Thrift!'), } const writer = new Writer() @@ -41,8 +41,9 @@ describe('serializeTCompactProtocol', () => { expect(result.field_6).toBe(BigInt('0x7fffffffffffffff')) expect(result.field_7).toBeCloseTo(123.456) // Decode the binary back into a string - const decodedString = new TextDecoder().decode(result.field_8) - expect(decodedString).toBe('Hello, Thrift!') + const decoder = new TextDecoder() + expect(decoder.decode(result.field_8)).toBe('Hello, Thrift!') + expect(decoder.decode(result.field_9)).toBe('Hello, Thrift!') }) it('serializes a nested STRUCT and LIST of booleans', () => { @@ -69,20 +70,6 @@ describe('serializeTCompactProtocol', () => { expect(result.field_2).toEqual([true, false, true, false]) }) - it('serializes a UUID correctly', () => { - // 32 hex chars => 16 bytes - const uuidHex = '00112233445566778899aabbccddeeff' - const data = { field_1: uuidHex } - - const writer = new Writer() - serializeTCompactProtocol(writer, data) - const buf = writer.buffer.slice(0, writer.offset) - const result = roundTripDeserialize(buf) - - // Should come back as the same string - expect(result.field_1).toBe(uuidHex) - }) - it('handles empty object (only STOP)', () => { const data = {} const writer = new Writer()