Fixed length byte array decimals

This commit is contained in:
Kenny Daniel 2025-04-11 17:26:07 -06:00
parent 279e055a60
commit fde7f81893
No known key found for this signature in database
GPG Key ID: FDF16101AF5AFD3A
4 changed files with 65 additions and 25 deletions

@ -49,7 +49,7 @@
},
"devDependencies": {
"@babel/eslint-parser": "7.27.0",
"@types/node": "22.14.0",
"@types/node": "22.14.1",
"@vitest/coverage-v8": "3.1.1",
"eslint": "9.24.0",
"eslint-plugin-jsdoc": "50.6.9",

@ -20,7 +20,7 @@ export function writePlain(writer, values, type) {
} else if (type === 'BYTE_ARRAY') {
writePlainByteArray(writer, values)
} else if (type === 'FIXED_LEN_BYTE_ARRAY') {
writePlainByteArray(writer, values)
writePlainByteArrayFixed(writer, values)
} else {
throw new Error(`parquet unsupported type: ${type}`)
}
@ -109,3 +109,14 @@ function writePlainByteArray(writer, values) {
writer.appendBytes(value)
}
}
/**
 * Write fixed-length byte array values in plain encoding.
 *
 * The bytes of each value are appended back to back with no per-value length
 * written, so every value must have the same size. A value of a different
 * size is rejected instead of silently producing misaligned output.
 *
 * @param {Writer} writer
 * @param {DecodedArray} values
 * @param {number} [fixedLength] expected byte length of every value; when
 *   omitted, the length of the first value is used
 * @throws {Error} if a value is not a Uint8Array, or if its length differs
 *   from the expected length
 */
function writePlainByteArrayFixed(writer, values, fixedLength) {
  let expectedLength = fixedLength
  for (const value of values) {
    if (!(value instanceof Uint8Array)) throw new Error('parquet expected Uint8Array value')
    // without an explicit length, the first value defines it for the rest
    if (expectedLength === undefined) expectedLength = value.length
    if (value.length !== expectedLength) {
      throw new Error(`parquet expected fixed length byte array of length ${expectedLength}, got ${value.length}`)
    }
    writer.appendBytes(value)
  }
}

@ -4,19 +4,19 @@ const dayMillis = 86400000 // 1 day in milliseconds
* Convert from rich to primitive types.
*
* @import {DecodedArray, SchemaElement} from 'hyparquet'
* @param {SchemaElement} schemaElement
* @param {SchemaElement} element
* @param {DecodedArray} values
* @returns {DecodedArray}
*/
export function unconvert(schemaElement, values) {
const ctype = schemaElement.converted_type
export function unconvert(element, values) {
const ctype = element.converted_type
if (ctype === 'DECIMAL') {
const scale = schemaElement.scale || 0
const scale = element.scale || 0
const factor = 10 ** scale
return values.map(v => {
if (v === null || v === undefined) return v
if (typeof v !== 'number') throw new Error('DECIMAL must be a number')
return unconvertDecimal(BigInt(Math.round(v * factor))) // to byte array
return unconvertDecimal(element, BigInt(Math.round(v * factor)))
})
}
if (ctype === 'DATE') {
@ -45,12 +45,12 @@ export function unconvert(schemaElement, values) {
* Unconvert from rich type to byte array for metadata statistics.
*
* @param {import('hyparquet/src/types.js').MinMaxType | undefined} value
* @param {SchemaElement} schema
* @param {SchemaElement} element
* @returns {Uint8Array | undefined}
*/
export function unconvertMetadata(value, schema) {
export function unconvertMetadata(value, element) {
if (value === undefined || value === null) return undefined
const { type, converted_type } = schema
const { type, converted_type } = element
if (type === 'BOOLEAN') return new Uint8Array([value ? 1 : 0])
if (type === 'BYTE_ARRAY' || type === 'FIXED_LEN_BYTE_ARRAY') {
// truncate byte arrays to 16 bytes for statistics
@ -91,25 +91,34 @@ export function unconvertMetadata(value, schema) {
}
/**
* @param {SchemaElement} element
* @param {bigint} value
* @returns {Uint8Array}
* @returns {number | bigint | Uint8Array}
*/
export function unconvertDecimal(value) {
if (value === 0n) return new Uint8Array([])
const bytes = []
let current = value
export function unconvertDecimal({ type, type_length }, value) {
if (type === 'INT32') return Number(value)
if (type === 'INT64') return value
if (type === 'FIXED_LEN_BYTE_ARRAY' && !type_length) {
throw new Error('fixed length byte array type_length is required')
}
if (!type_length && !value) return new Uint8Array()
const bytes = []
while (true) {
// extract the lowest 8 bits
const byte = Number(current & 0xffn)
const byte = Number(value & 0xffn)
bytes.unshift(byte)
current >>= 8n
value >>= 8n
// for nonnegative: stop when top byte has signBit = 0 AND shifted value == 0n
// for negative: stop when top byte has signBit = 1 AND shifted value == -1n
const signBit = byte & 0x80
if (!signBit && current === 0n || signBit && current === -1n) {
break
if (type_length) {
if (bytes.length >= type_length) break // fixed length
} else {
// for nonnegative: stop when top byte has signBit = 0 AND shifted value == 0n
// for negative: stop when top byte has signBit = 1 AND shifted value == -1n
const signBit = byte & 0x80
if (!signBit && value === 0n || signBit && value === -1n) {
break
}
}
}

@ -166,21 +166,41 @@ describe('unconvertDecimal', () => {
{ input: 1234567890123456789n, expected: new Uint8Array([0x11, 0x22, 0x10, 0xf4, 0x7d, 0xe9, 0x81, 0x15]) },
{ input: -1234567890123456789n, expected: new Uint8Array([0xee, 0xdd, 0xef, 0x0b, 0x82, 0x16, 0x7e, 0xeb]) },
]
/** @type {SchemaElement} */
const element = {
name: 'col',
type: 'BYTE_ARRAY',
}
it.for(examples)('should convert %p', ({ input, expected }) => {
expect(parseDecimal(expected)).toEqual(input)
})
it.for(examples)('should unconvert %p', ({ input, expected }) => {
expect(unconvertDecimal(input)).toEqual(expected)
expect(unconvertDecimal(element, input)).toEqual(expected)
})
it.for(examples)('should roundtrip %p', ({ input }) => {
expect(parseDecimal(unconvertDecimal(input))).toEqual(input)
const byteArray = unconvertDecimal(element, input)
if (!(byteArray instanceof Uint8Array)) throw new Error('expected Uint8Array')
expect(parseDecimal(byteArray)).toEqual(input)
})
it.for(examples)('should reverse roundtrip %p', ({ expected }) => {
expect(unconvertDecimal(parseDecimal(expected))).toEqual(expected)
expect(unconvertDecimal(element, parseDecimal(expected))).toEqual(expected)
})
// An INT32-typed element yields a plain JS number rather than a byte array.
it('convert to INT32', () => {
expect(unconvertDecimal({ name: 'col', type: 'INT32' }, 1234n)).toEqual(1234)
})
// An INT64-typed element yields the bigint value as-is.
it('convert to INT64', () => {
expect(unconvertDecimal({ name: 'col', type: 'INT64' }, 1234n)).toEqual(1234n)
})
// A FIXED_LEN_BYTE_ARRAY element with no type_length must be rejected.
it('throws if fixed length is not specified', () => {
expect(() => unconvertDecimal({ name: 'col', type: 'FIXED_LEN_BYTE_ARRAY' }, 1234n))
.toThrow('fixed length byte array type_length is required')
})
})