From 7be21caa9e09d982cb855f44e4aac908524d8b61 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Mon, 31 Mar 2025 13:42:57 -0700 Subject: [PATCH] Option for key_value_metadata --- README.md | 2 +- package.json | 4 ++-- src/metadata.js | 25 ++++++++++++++++++++----- src/types.d.ts | 5 +++++ src/write.js | 6 ++++-- test/metadata.test.js | 13 ++++++++++--- 6 files changed, 42 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 6a056fb..b942a4a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![minzipped](https://img.shields.io/bundlephobia/minzip/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer) [![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions) [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) -![coverage](https://img.shields.io/badge/Coverage-97-darkred) +![coverage](https://img.shields.io/badge/Coverage-96-darkred) [![dependencies](https://img.shields.io/badge/Dependencies-0-blueviolet)](https://www.npmjs.com/package/hyparquet?activeTab=dependencies) ## Usage diff --git a/package.json b/package.json index 4eff849..36d1793 100644 --- a/package.json +++ b/package.json @@ -42,11 +42,11 @@ "devDependencies": { "@babel/eslint-parser": "7.27.0", "@types/node": "22.13.14", - "@vitest/coverage-v8": "3.0.9", + "@vitest/coverage-v8": "3.1.1", "eslint": "9.23.0", "eslint-plugin-jsdoc": "50.6.9", "hyparquet": "1.10.1", "typescript": "5.8.2", - "vitest": "3.0.9" + "vitest": "3.1.1" } } diff --git a/src/metadata.js b/src/metadata.js index c265594..cd00041 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -1,4 +1,4 @@ -import { ConvertedType, Encoding, FieldRepetitionType, ParquetType } from 'hyparquet/src/constants.js' +import { ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from 'hyparquet/src/constants.js' import { serializeTCompactProtocol } from './thrift.js' const CompressionCodec = [ @@ -51,10 +51,18 @@ export function writeMetadata(writer, metadata) { field_10: c.meta_data.index_page_offset, field_11: c.meta_data.dictionary_page_offset, field_12: c.meta_data.statistics, - field_13: c.meta_data.encoding_stats, + field_13: c.meta_data.encoding_stats && c.meta_data.encoding_stats.map(es => ({ + field_1: PageType.indexOf(es.page_type), + field_2: Encoding.indexOf(es.encoding), + field_3: es.count, + })), field_14: c.meta_data.bloom_filter_offset, field_15: c.meta_data.bloom_filter_length, - field_16: c.meta_data.size_statistics, + field_16: c.meta_data.size_statistics && { + field_1: c.meta_data.size_statistics.unencoded_byte_array_data_bytes, + field_2: c.meta_data.size_statistics.repetition_level_histogram, + field_3: c.meta_data.size_statistics.definition_level_histogram, + }, }, field_4: c.offset_index_offset, field_5: c.offset_index_length, @@ -65,12 +73,19 @@ export function writeMetadata(writer, metadata) { })), field_2: rg.total_byte_size, field_3: rg.num_rows, - field_4: rg.sorting_columns, + field_4: rg.sorting_columns && rg.sorting_columns.map(sc => ({ + field_1: sc.column_idx, + field_2: sc.descending, + field_3: sc.nulls_first, + })), field_5: rg.file_offset, field_6: rg.total_compressed_size, field_7: rg.ordinal, })), - field_5: metadata.key_value_metadata, + field_5: metadata.key_value_metadata && metadata.key_value_metadata.map(kv => ({ + field_1: kv.key, + field_2: kv.value, + })), field_6: metadata.created_by, } diff --git a/src/types.d.ts b/src/types.d.ts index 793dd90..617de1c 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -6,6 +6,11 @@ export interface ColumnData { type?: ParquetType } +export interface KeyValue { + key: string + value?: string +} + export interface Writer { buffer: ArrayBuffer offset: number diff --git a/src/write.js b/src/write.js index b8827de..12bf9ec 100644 --- a/src/write.js +++ b/src/write.js @@ -7,13 +7,14 @@ import { getSchemaElementForValues } from './schema.js' * Write data as parquet to an ArrayBuffer * * @import {ColumnChunk, DecodedArray, FileMetaData, SchemaElement, SchemaTree} from 'hyparquet' - * @import {ColumnData} from '../src/types.js' + * @import {ColumnData, KeyValue} from '../src/types.js' * @param {object} options * @param {ColumnData[]} options.columnData * @param {boolean} [options.compressed] + * @param {KeyValue[]} [options.kvMetadata] * @returns {ArrayBuffer} */ -export function parquetWrite({ columnData, compressed = true }) { +export function parquetWrite({ columnData, compressed = true, kvMetadata }) { const writer = new Writer() // Check if all columns have the same length @@ -73,6 +74,7 @@ export function parquetWrite({ columnData, compressed = true }) { num_rows, }], metadata_length: 0, + key_value_metadata: kvMetadata, } // @ts-ignore don't want to actually serialize metadata_length delete metadata.metadata_length diff --git a/test/metadata.test.js b/test/metadata.test.js index b72e204..1543e78 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -121,8 +121,15 @@ describe('writeMetadata', () => { writer.appendUint32(0x31524150) // Write metadata - /** @type {FileMetaData} */ - writeMetadata(writer, exampleMetadata) + const withKvMetadata = { + ...exampleMetadata, + key_value_metadata: [ + { key: 'key1', value: 'value1' }, + { key: 'key2', value: 'value2' }, + ], + metadata_length: 370, + } + writeMetadata(writer, withKvMetadata) // Write footer PAR1 writer.appendUint32(0x31524150) @@ -131,7 +138,7 @@ describe('writeMetadata', () => { const output = parquetMetadata(file) /** @type {FileMetaData} */ - expect(output).toEqual(exampleMetadata) + expect(output).toEqual(withKvMetadata) }) })