From c83aa2ea5be75a8cfe8c6875f4d6f4b7989a73ba Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Mon, 13 May 2024 09:22:55 -0700 Subject: [PATCH] Float16 --- src/convert.js | 19 +++++ src/metadata.js | 17 ++-- src/types.d.ts | 2 +- test/convert.test.js | 32 ++++++- test/files/float16_nonzeros_and_nans.json | 10 +++ .../float16_nonzeros_and_nans.metadata.json | 78 ++++++++++++++++++ test/files/float16_nonzeros_and_nans.parquet | Bin 0 -> 501 bytes test/readFiles.test.js | 3 +- 8 files changed, 153 insertions(+), 8 deletions(-) create mode 100644 test/files/float16_nonzeros_and_nans.json create mode 100644 test/files/float16_nonzeros_and_nans.metadata.json create mode 100644 test/files/float16_nonzeros_and_nans.parquet diff --git a/src/convert.js b/src/convert.js index 188be41..ad9e1e0 100644 --- a/src/convert.js +++ b/src/convert.js @@ -45,6 +45,10 @@ export function convert(data, schemaElement) { if (ctype === 'INTERVAL') { throw new Error('parquet interval not supported') } + const logicalType = schemaElement.logical_type?.type + if (logicalType === 'FLOAT16') { + return Array.from(data).map(parseFloat16) + } return data } @@ -71,3 +75,18 @@ function parseInt96Date(value) { const millis = days * dayMillis + nano return new Date(millis) } + +/** + * @param {Uint8Array | undefined} bytes + * @returns {number | undefined} + */ +export function parseFloat16(bytes) { + if (!bytes) return undefined + const int16 = (bytes[1] << 8) | bytes[0] + const sign = int16 >> 15 ? -1 : 1 + const exp = (int16 >> 10) & 0x1f + const frac = int16 & 0x3ff + if (exp === 0) return sign * Math.pow(2, -14) * (frac / 1024) // subnormals + if (exp === 0x1f) return frac ? NaN : sign * Infinity + return sign * Math.pow(2, exp - 15) * (1 + frac / 1024) +} diff --git a/src/metadata.js b/src/metadata.js index 03e8d1f..46931c0 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -1,4 +1,5 @@ import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js' +import { parseFloat16 } from './convert.js' import { getSchemaPath } from './schema.js' import { deserializeTCompactProtocol } from './thrift.js' @@ -24,6 +25,7 @@ import { deserializeTCompactProtocol } from './thrift.js' * * @typedef {import("./types.d.ts").AsyncBuffer} AsyncBuffer * @typedef {import("./types.d.ts").FileMetaData} FileMetaData + * @typedef {import("./types.d.ts").SchemaElement} SchemaElement * @param {AsyncBuffer} asyncBuffer parquet file contents * @param {number} initialFetchSize initial fetch size in bytes * @returns {Promise} parquet metadata object @@ -103,6 +105,7 @@ export function parquetMetadata(arrayBuffer) { // Parse metadata from thrift data const version = metadata.field_1 + /** @type {SchemaElement[]} */ const schema = metadata.field_2.map((/** @type {any} */ field) => ({ type: ParquetType[field.field_1], type_length: field.field_2, @@ -115,8 +118,8 @@ export function parquetMetadata(arrayBuffer) { field_id: field.field_9, logical_type: logicalType(field.field_10), })) - // @ts-expect-error get types by column index - const columnTypes = schema.map(e => e.type).filter(e => e) + // schema element per column index + const columnSchema = schema.filter(e => e.type) const num_rows = metadata.field_3 const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({ columns: rowGroup.field_1.map((/** @type {any} */ column, /** @type {number} */ columnIndex) => ({ @@ -134,7 +137,7 @@ export function parquetMetadata(arrayBuffer) { data_page_offset: column.field_3.field_9, index_page_offset: column.field_3.field_10, dictionary_page_offset: column.field_3.field_11, - statistics: columnStats(column.field_3.field_12, columnTypes[columnIndex]), + statistics: columnStats(column.field_3.field_12, columnSchema[columnIndex]), encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({ page_type: encodingStat.field_1, encoding: Encoding[encodingStat.field_2], @@ -235,10 +238,11 @@ function logicalType(logicalType) { * Convert column statistics based on column type. * * @param {any} stats - * @param {import("./types.d.ts").ParquetType} type + * @param {SchemaElement} schema * @returns {import("./types.d.ts").Statistics} */ -function columnStats(stats, type) { +function columnStats(stats, schema) { + const { type, logical_type } = schema function convert(/** @type {Uint8Array} */ value) { if (value === undefined) return value if (type === 'BOOLEAN') return value[0] === 1 @@ -259,6 +263,9 @@ function columnStats(stats, type) { const view = new DataView(value.buffer, value.byteOffset, value.byteLength) return view.getFloat64(0, true) } + if (logical_type?.type === 'FLOAT16') { + return parseFloat16(value) + } return value } return stats && { diff --git a/src/types.d.ts b/src/types.d.ts index 579be2d..ccfe9a0 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -44,7 +44,7 @@ export interface SchemaElement { scale?: number precision?: number field_id?: number - logicalType?: LogicalType + logical_type?: LogicalType } export type ParquetType = diff --git a/test/convert.test.js b/test/convert.test.js index 5987478..563e0b3 100644 --- a/test/convert.test.js +++ b/test/convert.test.js @@ -1,5 +1,5 @@ import { describe, expect, it } from 'vitest' -import { convert } from '../src/convert.js' +import { convert, parseFloat16 } from '../src/convert.js' /** * @typedef {import('../src/types.js').SchemaElement} SchemaElement @@ -101,3 +101,33 @@ describe('convert function', () => { .toThrow('parquet interval not supported') }) }) + +describe('parseFloat16', () => { + it('should convert numbers', () => { + expect(parseFloat16(new Uint8Array([0x00, 0xbc]))).toBe(-1) + expect(parseFloat16(new Uint8Array([0x00, 0x00]))).toBe(0) + expect(parseFloat16(new Uint8Array([0x00, 0x38]))).toBe(0.5) + expect(parseFloat16(new Uint8Array([0x00, 0x3c]))).toBe(1) + expect(parseFloat16(new Uint8Array([0x00, 0x40]))).toBe(2) + }) + + it('should convert -0', () => { + expect(parseFloat16(new Uint8Array([0x00, 0x80]))).toBe(-0) + expect(parseFloat16(new Uint8Array([0x00, 0x80]))).not.toBe(0) + }) + + it('should convert Infinity', () => { + expect(parseFloat16(new Uint8Array([0x00, 0x7c]))).toBe(Infinity) + expect(parseFloat16(new Uint8Array([0x00, 0xfc]))).toBe(-Infinity) + }) + + it('should convert NaN', () => { + expect(parseFloat16(new Uint8Array([0x00, 0x7e]))).toBeNaN() + expect(parseFloat16(new Uint8Array([0x01, 0x7e]))).toBeNaN() + }) + + it('should convert a subnormal number', () => { + expect(parseFloat16(new Uint8Array([0xff, 0x03]))) + .toBeCloseTo(Math.pow(2, -14) * (1023 / 1024), 5) + }) +}) diff --git a/test/files/float16_nonzeros_and_nans.json b/test/files/float16_nonzeros_and_nans.json new file mode 100644 index 0000000..ee7d17c --- /dev/null +++ b/test/files/float16_nonzeros_and_nans.json @@ -0,0 +1,10 @@ +[ + [null], + [1], + [-2], + [null], + [0], + [-1], + [0], + [2] +] diff --git a/test/files/float16_nonzeros_and_nans.metadata.json b/test/files/float16_nonzeros_and_nans.metadata.json new file mode 100644 index 0000000..82b6fd9 --- /dev/null +++ b/test/files/float16_nonzeros_and_nans.metadata.json @@ -0,0 +1,78 @@ +{ + "version": 2, + "schema": [ + { + "repetition_type": "REQUIRED", + "name": "schema", + "num_children": 1 + }, + { + "type": "FIXED_LEN_BYTE_ARRAY", + "type_length": 2, + "repetition_type": "OPTIONAL", + "name": "x", + "logical_type": { + "type": "FLOAT16" + } + } + ], + "num_rows": 8, + "row_groups": [ + { + "columns": [ + { + "file_offset": 80, + "meta_data": { + "type": "FIXED_LEN_BYTE_ARRAY", + "encodings": [ + "PLAIN", + "RLE", + "RLE_DICTIONARY" + ], + "path_in_schema": [ + "x" + ], + "codec": "UNCOMPRESSED", + "num_values": 8, + "total_uncompressed_size": 76, + "total_compressed_size": 76, + "data_page_offset": 32, + "dictionary_page_offset": 4, + "statistics": { + "max": 2, + "min": -2, + "null_count": 1, + "max_value": 2, + "min_value": -2 + }, + "encoding_stats": [ + { + "page_type": 2, + "encoding": "PLAIN", + "count": 1 + }, + { + "page_type": 0, + "encoding": "RLE_DICTIONARY", + "count": 1 + } + ] + } + } + ], + "total_byte_size": 76, + "num_rows": 8, + "file_offset": 4, + "total_compressed_size": 76, + "ordinal": 0 + } + ], + "key_value_metadata": [ + { + "key": "ARROW:schema", + "value": "/////3AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAEAAAAUAAAAEAAUAAgABgAHAAwAAAAQABAAAAAAAAEDEAAAABgAAAAEAAAAAAAAAAEAAAB4AAAABAAEAAQAAAAAAAAA" + } + ], + "created_by": "parquet-cpp-arrow version 15.0.0-SNAPSHOT", + "metadata_length": 346 +} diff --git a/test/files/float16_nonzeros_and_nans.parquet b/test/files/float16_nonzeros_and_nans.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6a0fc412d4c527fa26f518fa848ddf82ed5aa0d5 GIT binary patch literal 501 zcmb_ZJx>Bb5S=^7g~E`FaW1!J3#yAIfDolI7R(_A;|E_F6FWl4X`+CKij}eSCs^`B zEU5fd#`yMt&eFHL^WM(9-Pzo=q*LZJYu2ZnWm+eq1Dew-_>F( z6>IL0!QY2!`uV3jd_9kVd6v)NaQjP3iphL z75UXuM9AMB_u_}A!DFF69v6CXJeua72l3=~G|ZJN#Zs|U=$=cf+h|@Wt?}uuEut2F F_%B@SVg&#I literal 0 HcmV?d00001 diff --git a/test/readFiles.test.js b/test/readFiles.test.js index c704009..93b3253 100644 --- a/test/readFiles.test.js +++ b/test/readFiles.test.js @@ -27,7 +27,8 @@ describe('parquetRead test files', () => { onComplete: (rows) => { const base = filename.replace('.parquet', '') const expected = fileToJson(`test/files/${base}.json`) - expect(toJson(rows)).toEqual(expected) + // stringify and parse to make legal json + expect(JSON.parse(JSON.stringify(toJson(rows)))).toEqual(expected) }, }) })