From fde7f81893047d1c07097a760a3f374e7fca2567 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Fri, 11 Apr 2025 17:26:07 -0600 Subject: [PATCH] Fixed length byte array decimals --- package.json | 2 +- src/plain.js | 13 ++++++++++- src/unconvert.js | 49 +++++++++++++++++++++++++----------------- test/unconvert.test.js | 26 +++++++++++++++++++--- 4 files changed, 65 insertions(+), 25 deletions(-) diff --git a/package.json b/package.json index 303246c..7cb72ce 100644 --- a/package.json +++ b/package.json @@ -49,7 +49,7 @@ }, "devDependencies": { "@babel/eslint-parser": "7.27.0", - "@types/node": "22.14.0", + "@types/node": "22.14.1", "@vitest/coverage-v8": "3.1.1", "eslint": "9.24.0", "eslint-plugin-jsdoc": "50.6.9", diff --git a/src/plain.js b/src/plain.js index d4776a7..97b91e3 100644 --- a/src/plain.js +++ b/src/plain.js @@ -20,7 +20,7 @@ export function writePlain(writer, values, type) { } else if (type === 'BYTE_ARRAY') { writePlainByteArray(writer, values) } else if (type === 'FIXED_LEN_BYTE_ARRAY') { - writePlainByteArray(writer, values) + writePlainByteArrayFixed(writer, values) } else { throw new Error(`parquet unsupported type: ${type}`) } @@ -109,3 +109,14 @@ function writePlainByteArray(writer, values) { writer.appendBytes(value) } } + +/** + * @param {Writer} writer + * @param {DecodedArray} values + */ +function writePlainByteArrayFixed(writer, values) { + for (const value of values) { + if (!(value instanceof Uint8Array)) throw new Error('parquet expected Uint8Array value') + writer.appendBytes(value) + } +} diff --git a/src/unconvert.js b/src/unconvert.js index 6718439..219aeb2 100644 --- a/src/unconvert.js +++ b/src/unconvert.js @@ -4,19 +4,19 @@ const dayMillis = 86400000 // 1 day in milliseconds * Convert from rich to primitive types. * * @import {DecodedArray, SchemaElement} from 'hyparquet' - * @param {SchemaElement} schemaElement + * @param {SchemaElement} element * @param {DecodedArray} values * @returns {DecodedArray} */ -export function unconvert(schemaElement, values) { - const ctype = schemaElement.converted_type +export function unconvert(element, values) { + const ctype = element.converted_type if (ctype === 'DECIMAL') { - const scale = schemaElement.scale || 0 + const scale = element.scale || 0 const factor = 10 ** scale return values.map(v => { if (v === null || v === undefined) return v if (typeof v !== 'number') throw new Error('DECIMAL must be a number') - return unconvertDecimal(BigInt(Math.round(v * factor))) // to byte array + return unconvertDecimal(element, BigInt(Math.round(v * factor))) }) } if (ctype === 'DATE') { @@ -45,12 +45,12 @@ export function unconvert(schemaElement, values) { * Uncovert from rich type to byte array for metadata statistics. * * @param {import('hyparquet/src/types.js').MinMaxType | undefined} value - * @param {SchemaElement} schema + * @param {SchemaElement} element * @returns {Uint8Array | undefined} */ -export function unconvertMetadata(value, schema) { +export function unconvertMetadata(value, element) { if (value === undefined || value === null) return undefined - const { type, converted_type } = schema + const { type, converted_type } = element if (type === 'BOOLEAN') return new Uint8Array([value ? 1 : 0]) if (type === 'BYTE_ARRAY' || type === 'FIXED_LEN_BYTE_ARRAY') { // truncate byte arrays to 16 bytes for statistics @@ -91,25 +91,34 @@ export function unconvertMetadata(value, schema) { } /** + * @param {SchemaElement} element * @param {bigint} value - * @returns {Uint8Array} + * @returns {number | bigint | Uint8Array} */ -export function unconvertDecimal(value) { - if (value === 0n) return new Uint8Array([]) - const bytes = [] - let current = value +export function unconvertDecimal({ type, type_length }, value) { + if (type === 'INT32') return Number(value) + if (type === 'INT64') return value + if (type === 'FIXED_LEN_BYTE_ARRAY' && !type_length) { + throw new Error('fixed length byte array type_length is required') + } + if (!type_length && !value) return new Uint8Array() + const bytes = [] while (true) { // extract the lowest 8 bits - const byte = Number(current & 0xffn) + const byte = Number(value & 0xffn) bytes.unshift(byte) - current >>= 8n + value >>= 8n - // for nonnegative: stop when top byte has signBit = 0 AND shifted value == 0n - // for negative: stop when top byte has signBit = 1 AND shifted value == -1n - const signBit = byte & 0x80 - if (!signBit && current === 0n || signBit && current === -1n) { - break + if (type_length) { + if (bytes.length >= type_length) break // fixed length + } else { + // for nonnegative: stop when top byte has signBit = 0 AND shifted value == 0n + // for negative: stop when top byte has signBit = 1 AND shifted value == -1n + const signBit = byte & 0x80 + if (!signBit && value === 0n || signBit && value === -1n) { + break + } } } diff --git a/test/unconvert.test.js b/test/unconvert.test.js index 581b4e9..cf3ca92 100644 --- a/test/unconvert.test.js +++ b/test/unconvert.test.js @@ -166,21 +166,41 @@ describe('unconvertDecimal', () => { { input: 1234567890123456789n, expected: new Uint8Array([0x11, 0x22, 0x10, 0xf4, 0x7d, 0xe9, 0x81, 0x15]) }, { input: -1234567890123456789n, expected: new Uint8Array([0xee, 0xdd, 0xef, 0x0b, 0x82, 0x16, 0x7e, 0xeb]) }, ] + /** @type {SchemaElement} */ + const element = { + name: 'col', + type: 'BYTE_ARRAY', + } it.for(examples)('should convert %p', ({ input, expected }) => { expect(parseDecimal(expected)).toEqual(input) }) it.for(examples)('should unconvert %p', ({ input, expected }) => { - expect(unconvertDecimal(input)).toEqual(expected) + expect(unconvertDecimal(element, input)).toEqual(expected) }) it.for(examples)('should roundtrip %p', ({ input }) => { - expect(parseDecimal(unconvertDecimal(input))).toEqual(input) + const byteArray = unconvertDecimal(element, input) + if (!(byteArray instanceof Uint8Array)) throw new Error('expected Uint8Array') + expect(parseDecimal(byteArray)).toEqual(input) }) it.for(examples)('should reverse roundtrip %p', ({ expected }) => { - expect(unconvertDecimal(parseDecimal(expected))).toEqual(expected) + expect(unconvertDecimal(element, parseDecimal(expected))).toEqual(expected) + }) + + it('convert to INT32', () => { + expect(unconvertDecimal({ name: 'col', type: 'INT32' }, 1234n)).toEqual(1234) + }) + + it('convert to INT64', () => { + expect(unconvertDecimal({ name: 'col', type: 'INT64' }, 1234n)).toEqual(1234n) + }) + + it('throws if fixed length is not specified', () => { + expect(() => unconvertDecimal({ name: 'col', type: 'FIXED_LEN_BYTE_ARRAY' }, 1234n)) + .toThrow('fixed length byte array type_length is required') }) })