Option for key_value_metadata

Kenny Daniel 2025-03-31 13:42:57 -07:00
parent 07928e8eb7
commit 7be21caa9e
GPG Key ID: 90AB653A8CAD7E45
6 changed files with 42 additions and 13 deletions
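
For context, a minimal usage sketch of the new option follows, assuming parquetWrite is the package's public export and that each ColumnData entry carries a name and a data array (neither detail is shown in this diff):

```js
// Sketch only: write a parquet file with custom key/value footer metadata.
// The 'hyparquet-writer' import path and the ColumnData shape are assumptions.
import { parquetWrite } from 'hyparquet-writer'

const arrayBuffer = parquetWrite({
  columnData: [
    { name: 'id', data: [1, 2, 3] },
    { name: 'label', data: ['a', 'b', 'c'] },
  ],
  // New in this commit: optional KeyValue[] written to FileMetaData.key_value_metadata
  kvMetadata: [
    { key: 'source', value: 'example' },
    { key: 'pipeline_version', value: '1.0.0' },
  ],
})
```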

@@ -4,7 +4,7 @@
[![minzipped](https://img.shields.io/bundlephobia/minzip/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer)
[![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions)
[![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)
-![coverage](https://img.shields.io/badge/Coverage-97-darkred)
+![coverage](https://img.shields.io/badge/Coverage-96-darkred)
[![dependencies](https://img.shields.io/badge/Dependencies-0-blueviolet)](https://www.npmjs.com/package/hyparquet?activeTab=dependencies)
## Usage

@@ -42,11 +42,11 @@
"devDependencies": {
"@babel/eslint-parser": "7.27.0",
"@types/node": "22.13.14",
"@vitest/coverage-v8": "3.0.9",
"@vitest/coverage-v8": "3.1.1",
"eslint": "9.23.0",
"eslint-plugin-jsdoc": "50.6.9",
"hyparquet": "1.10.1",
"typescript": "5.8.2",
"vitest": "3.0.9"
"vitest": "3.1.1"
}
}

@@ -1,4 +1,4 @@
-import { ConvertedType, Encoding, FieldRepetitionType, ParquetType } from 'hyparquet/src/constants.js'
+import { ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from 'hyparquet/src/constants.js'
import { serializeTCompactProtocol } from './thrift.js'
const CompressionCodec = [
@@ -51,10 +51,18 @@ export function writeMetadata(writer, metadata) {
field_10: c.meta_data.index_page_offset,
field_11: c.meta_data.dictionary_page_offset,
field_12: c.meta_data.statistics,
-field_13: c.meta_data.encoding_stats,
+field_13: c.meta_data.encoding_stats && c.meta_data.encoding_stats.map(es => ({
+field_1: PageType.indexOf(es.page_type),
+field_2: Encoding.indexOf(es.encoding),
+field_3: es.count,
+})),
field_14: c.meta_data.bloom_filter_offset,
field_15: c.meta_data.bloom_filter_length,
-field_16: c.meta_data.size_statistics,
+field_16: c.meta_data.size_statistics && {
+field_1: c.meta_data.size_statistics.unencoded_byte_array_data_bytes,
+field_2: c.meta_data.size_statistics.repetition_level_histogram,
+field_3: c.meta_data.size_statistics.definition_level_histogram,
+},
},
field_4: c.offset_index_offset,
field_5: c.offset_index_length,
@@ -65,12 +73,19 @@ export function writeMetadata(writer, metadata) {
})),
field_2: rg.total_byte_size,
field_3: rg.num_rows,
-field_4: rg.sorting_columns,
+field_4: rg.sorting_columns && rg.sorting_columns.map(sc => ({
+field_1: sc.column_idx,
+field_2: sc.descending,
+field_3: sc.nulls_first,
+})),
field_5: rg.file_offset,
field_6: rg.total_compressed_size,
field_7: rg.ordinal,
})),
-field_5: metadata.key_value_metadata,
+field_5: metadata.key_value_metadata && metadata.key_value_metadata.map(kv => ({
+field_1: kv.key,
+field_2: kv.value,
+})),
field_6: metadata.created_by,
}

src/types.d.ts (vendored): 5 additions

@@ -6,6 +6,11 @@ export interface ColumnData {
type?: ParquetType
}
+export interface KeyValue {
+key: string
+value?: string
+}
export interface Writer {
buffer: ArrayBuffer
offset: number

@@ -7,13 +7,14 @@ import { getSchemaElementForValues } from './schema.js'
* Write data as parquet to an ArrayBuffer
*
* @import {ColumnChunk, DecodedArray, FileMetaData, SchemaElement, SchemaTree} from 'hyparquet'
-* @import {ColumnData} from '../src/types.js'
+* @import {ColumnData, KeyValue} from '../src/types.js'
* @param {object} options
* @param {ColumnData[]} options.columnData
* @param {boolean} [options.compressed]
+* @param {KeyValue[]} [options.kvMetadata]
* @returns {ArrayBuffer}
*/
-export function parquetWrite({ columnData, compressed = true }) {
+export function parquetWrite({ columnData, compressed = true, kvMetadata }) {
const writer = new Writer()
// Check if all columns have the same length
@@ -73,6 +74,7 @@ export function parquetWrite({ columnData, compressed = true }) {
num_rows,
}],
metadata_length: 0,
+key_value_metadata: kvMetadata,
}
// @ts-ignore don't want to actually serialize metadata_length
delete metadata.metadata_length

@@ -121,8 +121,15 @@ describe('writeMetadata', () => {
writer.appendUint32(0x31524150)
// Write metadata
/** @type {FileMetaData} */
-writeMetadata(writer, exampleMetadata)
+const withKvMetadata = {
+...exampleMetadata,
+key_value_metadata: [
+{ key: 'key1', value: 'value1' },
+{ key: 'key2', value: 'value2' },
+],
+metadata_length: 370,
+}
+writeMetadata(writer, withKvMetadata)
// Write footer PAR1
writer.appendUint32(0x31524150)
@@ -131,7 +138,7 @@ describe('writeMetadata', () => {
const output = parquetMetadata(file)
/** @type {FileMetaData} */
-expect(output).toEqual(exampleMetadata)
+expect(output).toEqual(withKvMetadata)
})
})
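
The round trip exercised by this test can also be sketched outside the test harness, reusing hyparquet's parquetMetadata the same way the test does (the hyparquet-writer import path is again an assumption):

```js
import { parquetMetadata } from 'hyparquet'
import { parquetWrite } from 'hyparquet-writer' // assumed entry point

// Write a small file with key/value metadata, then parse the footer back.
const file = parquetWrite({
  columnData: [{ name: 'id', data: [1, 2, 3] }],
  kvMetadata: [{ key: 'key1', value: 'value1' }],
})
const output = parquetMetadata(file)
// output.key_value_metadata should round-trip as [{ key: 'key1', value: 'value1' }]
```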