From 4bf1595981925b12e482e91141b4c911fd5a3604 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Fri, 28 Mar 2025 16:13:27 -0700 Subject: [PATCH] Allow specifying column type --- README.md | 4 ++-- src/schema.js | 5 ++--- src/types.d.ts | 1 + src/write.js | 4 ++-- test/write.test.js | 26 ++++++++++++++++++++++++++ 5 files changed, 33 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 9594b93..b942a4a 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,8 @@ import { parquetWrite } from 'hyparquet-writer' const arrayBuffer = parquetWrite({ columnData: [ - { name: 'name', data: ['Alice', 'Bob', 'Charlie'] }, - { name: 'age', data: [25, 30, 35] }, + { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' }, + { name: 'age', data: [25, 30, 35], type: 'INT32' }, ], }) ``` diff --git a/src/schema.js b/src/schema.js index 62e5859..62f77e8 100644 --- a/src/schema.js +++ b/src/schema.js @@ -5,15 +5,14 @@ * @import {ConvertedType, DecodedArray, FieldRepetitionType, ParquetType, SchemaElement} from 'hyparquet' * @param {string} name * @param {DecodedArray} values + * @param {ParquetType | undefined} type * @returns {SchemaElement} */ -export function getSchemaElementForValues(name, values) { +export function getSchemaElementForValues(name, values, type) { if (values instanceof Int32Array) return { name, type: 'INT32', repetition_type: 'REQUIRED' } if (values instanceof BigInt64Array) return { name, type: 'INT64', repetition_type: 'REQUIRED' } if (values instanceof Float32Array) return { name, type: 'FLOAT', repetition_type: 'REQUIRED' } if (values instanceof Float64Array) return { name, type: 'DOUBLE', repetition_type: 'REQUIRED' } - /** @type {ParquetType | undefined} */ - let type = undefined /** @type {FieldRepetitionType} */ let repetition_type = 'REQUIRED' /** @type {ConvertedType | undefined} */ diff --git a/src/types.d.ts b/src/types.d.ts index f10648b..793dd90 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -3,6 +3,7 @@ import { DecodedArray, ParquetType } from "hyparquet" export interface ColumnData { name: string data: DecodedArray + type?: ParquetType } export interface Writer { diff --git a/src/write.js b/src/write.js index ba8ad2e..04a52dc 100644 --- a/src/write.js +++ b/src/write.js @@ -39,9 +39,9 @@ export function parquetWrite({ columnData, compressed = true }) { const columns = [] // Write columns - for (const { name, data } of columnData) { + for (const { name, data, type } of columnData) { // auto-detect type - const schemaElement = getSchemaElementForValues(name, data) + const schemaElement = getSchemaElementForValues(name, data, type) if (!schemaElement.type) throw new Error(`column ${name} cannot determine type`) const file_offset = BigInt(writer.offset) /** @type {SchemaElement[]} */ diff --git a/test/write.test.js b/test/write.test.js index d00dabb..fa40b12 100644 --- a/test/write.test.js +++ b/test/write.test.js @@ -136,6 +136,32 @@ describe('parquetWrite', () => { ]) }) + it('serializes empty column', async () => { + const result = await roundTripDeserialize([{ + name: 'empty', + data: [null, null, null, null], + type: 'BOOLEAN', + }]) + expect(result).toEqual([ + { empty: null }, + { empty: null }, + { empty: null }, + { empty: null }, + ]) + }) + + it('throws for wrong type specified', () => { + expect(() => parquetWrite({ columnData: [{ name: 'int', data: [1, 2, 3], type: 'BOOLEAN' }] })) + .toThrow('parquet cannot write mixed types') + }) + + it('throws for empty column with no type specified', () => { + expect(() => parquetWrite({ columnData: [{ name: 'empty', data: [] }] })) + .toThrow('column empty cannot determine type') + expect(() => parquetWrite({ columnData: [{ name: 'empty', data: [null, null, null, null] }] })) + .toThrow('column empty cannot determine type') + }) + it('throws for mixed types', () => { expect(() => parquetWrite({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] })) .toThrow('mixed types not supported')