From 6545196a1d4044e2885db367bfa0ddde4bbb6d61 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Tue, 8 Apr 2025 04:20:32 -0700 Subject: [PATCH] Float life --- src/bytewriter.js | 9 +++++++++ src/plain.js | 12 ++++++++++++ src/schema.js | 9 +++++++-- src/types.d.ts | 1 + test/metadata.test.js | 40 +++++++++++++++++++++++++++++---------- test/plain.test.js | 18 ++++++++++++++++++ test/write.buffer.test.js | 14 +++++++------- test/write.file.test.js | 8 ++++---- 8 files changed, 88 insertions(+), 23 deletions(-) diff --git a/src/bytewriter.js b/src/bytewriter.js index d4ab57d..6850ce0 100644 --- a/src/bytewriter.js +++ b/src/bytewriter.js @@ -71,6 +71,15 @@ ByteWriter.prototype.appendInt64 = function(value) { this.offset += 8 } +/** + * @param {number} value + */ +ByteWriter.prototype.appendFloat32 = function(value) { + this.ensure(this.offset + 8) + this.view.setFloat32(this.offset, value, true) + this.offset += 4 +} + /** * @param {number} value */ diff --git a/src/plain.js b/src/plain.js index 6cfdd1f..acaf441 100644 --- a/src/plain.js +++ b/src/plain.js @@ -13,6 +13,8 @@ export function writePlain(writer, values, type) { writePlainInt32(writer, values) } else if (type === 'INT64') { writePlainInt64(writer, values) + } else if (type === 'FLOAT') { + writePlainFloat(writer, values) } else if (type === 'DOUBLE') { writePlainDouble(writer, values) } else if (type === 'BYTE_ARRAY') { @@ -69,6 +71,16 @@ function writePlainInt64(writer, values) { } } +/** + * @param {Writer} writer + * @param {DecodedArray} values + */ +function writePlainFloat(writer, values) { + for (const value of values) { + writer.appendFloat32(value) + } +} + /** * @param {Writer} writer * @param {DecodedArray} values diff --git a/src/schema.js b/src/schema.js index 99147d3..279d8bb 100644 --- a/src/schema.js +++ b/src/schema.js @@ -55,9 +55,14 @@ export function getSchemaElementForValues(name, values, type) { type = valueType } else if (type === 'INT32' && valueType === 'DOUBLE') { type = 'DOUBLE' + } else if (type === 'FLOAT' && valueType === 'INT32') { + valueType = 'FLOAT' + } else if (type === 'FLOAT' && valueType === 'DOUBLE') { + valueType = 'FLOAT' } else if (type === 'DOUBLE' && valueType === 'INT32') { - // keep - } else if (type !== valueType) { + valueType = 'DOUBLE' + } + if (type !== valueType) { throw new Error(`parquet cannot write mixed types: ${type} and ${valueType}`) } } diff --git a/src/types.d.ts b/src/types.d.ts index 5a3eef0..776da78 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -27,6 +27,7 @@ export interface Writer { appendUint32(value: number): void appendInt32(value: number): void appendInt64(value: bigint): void + appendFloat32(value: number): void appendFloat64(value: number): void appendBuffer(buffer: ArrayBuffer): void appendBytes(value: Uint8Array): void diff --git a/test/metadata.test.js b/test/metadata.test.js index 92648ae..3b08ca4 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -11,10 +11,11 @@ export const exampleMetadata = { version: 2, created_by: 'hyparquet', schema: [ - { name: 'root', num_children: 6 }, + { name: 'root', num_children: 7 }, { name: 'bool', type: 'BOOLEAN', repetition_type: 'REQUIRED' }, { name: 'int', type: 'INT32', repetition_type: 'REQUIRED' }, { name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' }, + { name: 'float', type: 'FLOAT', repetition_type: 'REQUIRED' }, { name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' }, { name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED', converted_type: 'UTF8' }, { name: 'nullable', type: 'BOOLEAN', repetition_type: 'OPTIONAL' }, @@ -80,8 +81,27 @@ export const exampleMetadata = { }, }, { - file_path: 'double', + file_path: 'float', file_offset: 110n, + meta_data: { + type: 'FLOAT', + encodings: ['PLAIN'], + path_in_schema: ['float'], + codec: 'SNAPPY', + num_values: 4n, + total_uncompressed_size: 39n, + total_compressed_size: 39n, + data_page_offset: 110n, + statistics: { + null_count: 0n, + min_value: 0, + max_value: Infinity, + }, + }, + }, + { + file_path: 'double', + file_offset: 149n, meta_data: { type: 'DOUBLE', encodings: ['PLAIN'], @@ -90,7 +110,7 @@ export const exampleMetadata = { num_values: 4n, total_uncompressed_size: 51n, total_compressed_size: 51n, - data_page_offset: 110n, + data_page_offset: 149n, statistics: { null_count: 0n, min_value: 0, @@ -100,7 +120,7 @@ export const exampleMetadata = { }, { file_path: 'string', - file_offset: 161n, + file_offset: 200n, meta_data: { type: 'BYTE_ARRAY', encodings: ['PLAIN'], @@ -109,7 +129,7 @@ export const exampleMetadata = { num_values: 4n, total_uncompressed_size: 42n, total_compressed_size: 42n, - data_page_offset: 161n, + data_page_offset: 200n, statistics: { null_count: 0n, min_value: 'a', @@ -119,7 +139,7 @@ export const exampleMetadata = { }, { file_path: 'nullable', - file_offset: 203n, + file_offset: 242n, meta_data: { type: 'BOOLEAN', encodings: ['PLAIN'], @@ -128,7 +148,7 @@ export const exampleMetadata = { num_values: 4n, total_uncompressed_size: 26n, total_compressed_size: 26n, - data_page_offset: 203n, + data_page_offset: 242n, statistics: { null_count: 2n, min_value: false, @@ -137,10 +157,10 @@ export const exampleMetadata = { }, }, ], - total_byte_size: 225n, + total_byte_size: 264n, num_rows: 4n, }], - metadata_length: 432, + metadata_length: 497, } describe('writeMetadata', () => { @@ -158,7 +178,7 @@ describe('writeMetadata', () => { { key: 'key1', value: 'value1' }, { key: 'key2', value: 'value2' }, ], - metadata_length: 464, + metadata_length: 529, } writeMetadata(writer, withKvMetadata) diff --git a/test/plain.test.js b/test/plain.test.js index 96719bf..8ce90b4 100644 --- a/test/plain.test.js +++ b/test/plain.test.js @@ -41,6 +41,24 @@ describe('writePlain', () => { } }) + it('writes FLOAT', () => { + const writer = new ByteWriter() + const floats = [0, 300.5, -2.7100000381469727, Infinity, -Infinity, NaN] + writePlain(writer, floats, 'FLOAT') + + // 4 bytes per float + expect(writer.offset).toBe(4 * floats.length) + + for (let i = 0; i < floats.length; i++) { + const val = writer.view.getFloat32(i * 4, true) + if (Number.isNaN(floats[i])) { + expect(Number.isNaN(val)).toBe(true) + } else { + expect(val).toBe(floats[i]) + } + } + }) + it('writes DOUBLE', () => { const writer = new ByteWriter() const doubles = [0, 3.14, -2.71, Infinity, -Infinity, NaN] diff --git a/test/write.buffer.test.js b/test/write.buffer.test.js index ce044e4..68ea11c 100644 --- a/test/write.buffer.test.js +++ b/test/write.buffer.test.js @@ -20,7 +20,7 @@ export const basicData = [ { name: 'bool', data: [true, false, true, false] }, { name: 'int', data: [0, 127, 0x7fff, 0x7fffffff] }, { name: 'bigint', data: [0n, 127n, 0x7fffn, 0x7fffffffffffffffn] }, - // { name: 'float', data: [0, 0.0001, 123.456, 1e100], type: 'FLOAT' }, // TODO + { name: 'float', data: [0, 0.0001, 123.456, 1e100], type: 'FLOAT' }, { name: 'double', data: [0, 0.0001, 123.456, 1e100] }, { name: 'string', data: ['a', 'b', 'c', 'd'] }, { name: 'nullable', data: [true, false, null, null] }, @@ -36,10 +36,10 @@ describe('parquetWriteBuffer', () => { it('serializes basic types', async () => { const result = await roundTripDeserialize(basicData) expect(result).toEqual([ - { bool: true, int: 0, bigint: 0n, double: 0, string: 'a', nullable: true }, - { bool: false, int: 127, bigint: 127n, double: 0.0001, string: 'b', nullable: false }, - { bool: true, int: 0x7fff, bigint: 0x7fffn, double: 123.456, string: 'c', nullable: null }, - { bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, double: 1e100, string: 'd', nullable: null }, + { bool: true, int: 0, bigint: 0n, float: 0, double: 0, string: 'a', nullable: true }, + { bool: false, int: 127, bigint: 127n, float: 0.00009999999747378752, double: 0.0001, string: 'b', nullable: false }, + { bool: true, int: 0x7fff, bigint: 0x7fffn, float: 123.45600128173828, double: 123.456, string: 'c', nullable: null }, + { bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, float: Infinity, double: 1e100, string: 'd', nullable: null }, ]) }) @@ -92,8 +92,8 @@ describe('parquetWriteBuffer', () => { it('writes statistics when enabled', () => { const withStats = parquetWriteBuffer({ columnData: basicData, statistics: true }) const noStats = parquetWriteBuffer({ columnData: basicData, statistics: false }) - expect(withStats.byteLength).toBe(669) - expect(noStats.byteLength).toBe(575) + expect(withStats.byteLength).toBe(773) + expect(noStats.byteLength).toBe(663) }) it('serializes list types', async () => { diff --git a/test/write.file.test.js b/test/write.file.test.js index d13d914..b9f853d 100644 --- a/test/write.file.test.js +++ b/test/write.file.test.js @@ -34,10 +34,10 @@ describe('parquetWrite with FileWriter', () => { // check parquet data const result = await parquetReadObjects({ file, metadata }) expect(result).toEqual([ - { bool: true, int: 0, bigint: 0n, double: 0, string: 'a', nullable: true }, - { bool: false, int: 127, bigint: 127n, double: 0.0001, string: 'b', nullable: false }, - { bool: true, int: 0x7fff, bigint: 0x7fffn, double: 123.456, string: 'c', nullable: null }, - { bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, double: 1e100, string: 'd', nullable: null }, + { bool: true, int: 0, bigint: 0n, float: 0, double: 0, string: 'a', nullable: true }, + { bool: false, int: 127, bigint: 127n, float: 0.00009999999747378752, double: 0.0001, string: 'b', nullable: false }, + { bool: true, int: 0x7fff, bigint: 0x7fffn, float: 123.45600128173828, double: 123.456, string: 'c', nullable: null }, + { bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, float: Infinity, double: 1e100, string: 'd', nullable: null }, ]) }) })