Fix thrift encoding when delta > 15

This commit is contained in:
Kenny Daniel 2025-10-23 10:46:27 -07:00
parent e68f992101
commit a56c78de39
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
3 changed files with 39 additions and 14 deletions

@ -55,12 +55,12 @@
"hyparquet": "1.20.0"
},
"devDependencies": {
"@babel/eslint-parser": "7.28.4",
"@babel/eslint-parser": "7.28.5",
"@types/node": "24.9.1",
"@vitest/coverage-v8": "3.2.4",
"@vitest/coverage-v8": "4.0.2",
"eslint": "9.38.0",
"eslint-plugin-jsdoc": "61.1.5",
"eslint-plugin-jsdoc": "61.1.7",
"typescript": "5.9.3",
"vitest": "3.2.4"
"vitest": "4.0.2"
}
}

@ -28,8 +28,13 @@ export function serializeTCompactProtocol(writer, data) {
if (delta <= 0) {
throw new Error(`thrift non-monotonic field ID: fid=${fid}, lastFid=${lastFid}`)
}
// High nibble = delta, low nibble = type
writer.appendUint8(delta << 4 | type)
// short form (delta <= 15): high nibble = delta, low nibble = type; long form: type byte followed by the zigzag varint field id
if (delta <= 15) {
writer.appendUint8(delta << 4 | type)
} else {
writer.appendUint8(type)
writer.appendVarInt(fid << 1 ^ fid >> 15) // zigzag
}
// Write the field content itself
writeElement(writer, type, value)
@ -137,7 +142,12 @@ function writeElement(writer, type, value) {
if (delta <= 0) {
throw new Error(`Non-monotonic fid in struct: fid=${fid}, lastFid=${lastFid}`)
}
writer.appendUint8(delta << 4 | t & 0x0f)
if (delta <= 15) {
writer.appendUint8(delta << 4 | t)
} else {
writer.appendUint8(t)
writer.appendVarInt(fid << 1 ^ fid >> 15)
}
writeElement(writer, t, v)
lastFid = fid
}

@ -2,6 +2,7 @@ import { deserializeTCompactProtocol } from 'hyparquet/src/thrift.js'
import { describe, expect, it } from 'vitest'
import { serializeTCompactProtocol } from '../src/thrift.js'
import { ByteWriter } from '../src/bytewriter.js'
import { logicalType } from '../src/metadata.js'
/**
* Utility to decode a Thrift-serialized buffer and return the parsed object.
@ -30,8 +31,7 @@ describe('serializeTCompactProtocol', () => {
const writer = new ByteWriter()
serializeTCompactProtocol(writer, data)
const buf = writer.buffer.slice(0, writer.offset)
const result = roundTripDeserialize(buf)
const result = roundTripDeserialize(writer.getBuffer())
expect(result.field_1).toBe(true)
expect(result.field_2).toBe(false)
@ -61,8 +61,7 @@ describe('serializeTCompactProtocol', () => {
const writer = new ByteWriter()
serializeTCompactProtocol(writer, data)
const buf = writer.buffer.slice(0, writer.offset)
const result = roundTripDeserialize(buf)
const result = roundTripDeserialize(writer.getBuffer())
expect(result.field_1.field_1).toBe(42)
expect(result.field_1.field_2.field_1).toBe(true)
@ -74,13 +73,12 @@ describe('serializeTCompactProtocol', () => {
const data = {}
const writer = new ByteWriter()
serializeTCompactProtocol(writer, data)
const buf = writer.buffer.slice(0, writer.offset)
const arr = new Uint8Array(buf)
const arr = new Uint8Array(writer.getBuffer())
// The entire buffer should just be [0x00] = STOP
expect(arr).toEqual(new Uint8Array([0x00]))
// Round-trip: should deserialize to an empty object
const result = roundTripDeserialize(buf)
const result = roundTripDeserialize(writer.getBuffer())
expect(result).toEqual({})
})
@ -92,4 +90,21 @@ describe('serializeTCompactProtocol', () => {
const writer = new ByteWriter()
expect(() => serializeTCompactProtocol(writer, invalidData)).toThrow()
})
it('serializes field IDs with gaps larger than 15', () => {
  // field_1 -> field_17 is a delta of 16, one past the largest delta
  // the serializer packs into the single-byte field header.
  const byteWriter = new ByteWriter()
  serializeTCompactProtocol(byteWriter, { field_1: 1, field_17: 17 })

  // Both fields must survive a round trip through the deserializer.
  const decoded = roundTripDeserialize(byteWriter.getBuffer())
  expect(decoded.field_1).toBe(1)
  expect(decoded.field_17).toBe(17)
})
it('serializes GEOMETRY logicalType struct with field_17', () => {
  // Build the thrift-shaped logical type first, then nest it under field_1.
  const geometry = logicalType({ type: 'GEOMETRY' })
  const byteWriter = new ByteWriter()
  serializeTCompactProtocol(byteWriter, { field_1: geometry })

  // The nested struct is expected to come back with an empty struct at field_17.
  const decoded = roundTripDeserialize(byteWriter.getBuffer())
  expect(decoded.field_1.field_17).toEqual({})
})
})