From 5957c219bc7aa06f7be9e67cd5a469559e8da1cd Mon Sep 17 00:00:00 2001
From: Kenny Daniel
Date: Sat, 31 May 2025 22:53:24 -0700
Subject: [PATCH] Allow null columns to be auto-typed

---
Reviewer notes (not part of the commit message):
- autoSchemaElement no longer throws "cannot determine type" when no type
  was inferred; it falls back to type BYTE_ARRAY with repetition_type
  OPTIONAL.
- The updated tests exercise this fallback for an empty column and for a
  column containing only nulls.
- The in-code TODO mentions a future logical_type 'NULL' for this fallback.

 src/schema.js             |   7 ++-
 test/schema.test.js       | 123 ++++++++++++++++++++++++++++++++++++++
 test/write.buffer.test.js |   7 ---
 test/write.schema.test.js |  36 +++++++++++
 4 files changed, 165 insertions(+), 8 deletions(-)
 create mode 100644 test/schema.test.js

diff --git a/src/schema.js b/src/schema.js
index 2e999ec..bb5c8da 100644
--- a/src/schema.js
+++ b/src/schema.js
@@ -133,7 +133,12 @@ export function autoSchemaElement(name, values) {
       }
     }
   }
-  if (!type) throw new Error(`column ${name} cannot determine type`)
+  if (!type) {
+    // fallback to nullable BYTE_ARRAY
+    // TODO: logical_type: 'NULL'
+    type = 'BYTE_ARRAY'
+    repetition_type = 'OPTIONAL'
+  }
   return { name, type, repetition_type, converted_type }
 }
 
diff --git a/test/schema.test.js b/test/schema.test.js
new file mode 100644
index 0000000..f235493
--- /dev/null
+++ b/test/schema.test.js
@@ -0,0 +1,123 @@
+import { describe, expect, it } from 'vitest'
+import { autoSchemaElement, getMaxDefinitionLevel, getMaxRepetitionLevel, schemaFromColumnData } from '../src/schema.js'
+
+/**
+ * @import {SchemaElement} from 'hyparquet'
+ */
+
+describe('schemaFromColumnData', () => {
+  it('honours provided type with nullable = false → REQUIRED', () => {
+    const schema = schemaFromColumnData({
+      columnData: [
+        { name: 'id', data: new Int32Array([1, 2, 3]), type: 'INT32', nullable: false },
+      ],
+    })
+    expect(schema[1]).toEqual({ name: 'id', type: 'INT32', repetition_type: 'REQUIRED' })
+  })
+
+  it('applies valid schema override verbatim', () => {
+    const schema = schemaFromColumnData({
+      columnData: [{ name: 'strings', data: ['a', 'b'] }],
+      schemaOverrides: {
+        strings: {
+          name: 'strings',
+          type: 'BYTE_ARRAY',
+          converted_type: 'UTF8',
+          repetition_type: 'OPTIONAL',
+        },
+      },
+    })
+    expect(schema[1].name).toBe('strings')
+    expect(schema[1].type).toBe('BYTE_ARRAY')
+    expect(schema[1].converted_type).toBe('UTF8')
+    expect(schema[1].repetition_type).toBe('OPTIONAL')
+  })
+
+  it('throws when column lengths differ', () => {
+    expect(() =>
+      schemaFromColumnData({
+        columnData: [
+          { name: 'a', data: new Int32Array([1]) },
+          { name: 'b', data: new Int32Array([1, 2]) },
+        ],
+      })
+    ).toThrow(/columns must have the same length/)
+  })
+
+  it('rejects override type REPEATED', () => {
+    expect(() =>
+      schemaFromColumnData({
+        columnData: [{ name: 'x', data: new Int32Array([1]) }],
+        schemaOverrides: { x: { name: 'x', type: 'INT32', repetition_type: 'REPEATED' } },
+      })
+    ).toThrow(/cannot be repeated/)
+  })
+
+  it('rejects override with children', () => {
+    expect(() =>
+      schemaFromColumnData({
+        columnData: [{ name: 'x', data: new Int32Array([1]) }],
+        schemaOverrides: { x: { name: 'x', type: 'INT32', num_children: 1 } },
+      })
+    ).toThrow(/cannot have children/)
+  })
+
+  it('rejects override with mismatched name', () => {
+    expect(() =>
+      schemaFromColumnData({
+        columnData: [{ name: 'x', data: new Int32Array([1]) }],
+        schemaOverrides: { x: { name: 'y', type: 'INT32' } },
+      })
+    ).toThrow(/does not match column name/)
+  })
+})
+
+describe('autoSchemaElement', () => {
+  it.each([
+    [new Int32Array([1, 2]), 'INT32'],
+    [new BigInt64Array([1n, 2n]), 'INT64'],
+    [new Float32Array([1, 2]), 'FLOAT'],
+    [new Float64Array([1, 2]), 'DOUBLE'],
+  ])('detects typed arrays (%#)', (data, expected) => {
+    const el = autoSchemaElement('col', data)
+    expect(el.type).toBe(expected)
+    expect(el.repetition_type).toBe('REQUIRED')
+  })
+
+  it('promotes INT32 + DOUBLE mix to DOUBLE', () => {
+    const el = autoSchemaElement('mix', [1, 2.5])
+    expect(el.type).toBe('DOUBLE')
+  })
+
+  it('sets repetition_type OPTIONAL when nulls present', () => {
+    const el = autoSchemaElement('maybe', [null, 1])
+    expect(el.repetition_type).toBe('OPTIONAL')
+  })
+
+  it('falls back to BYTE_ARRAY for empty arrays', () => {
+    const el = autoSchemaElement('empty', [])
+    expect(el.type).toBe('BYTE_ARRAY')
+    expect(el.repetition_type).toBe('OPTIONAL')
+  })
+
+  it('throws on incompatible mixed scalar types', () => {
+    expect(() => autoSchemaElement('bad', [1, 'a'])).toThrow(/mixed types/)
+  })
+})
+
+describe('level helpers', () => {
+  /** @type {SchemaElement[]} */
+  const path = [
+    { name: 'root', repetition_type: 'REPEATED' },
+    { name: 'child', repetition_type: 'OPTIONAL' },
+    { name: 'leaf', repetition_type: 'REPEATED' },
+  ]
+
+  it('computes max repetition level', () => {
+    expect(getMaxRepetitionLevel(path)).toBe(2)
+  })
+
+  it('computes max definition level', () => {
+    expect(getMaxDefinitionLevel(path)).toBe(2)
+  })
+})
diff --git a/test/write.buffer.test.js b/test/write.buffer.test.js
index ab1a2fb..4b875b6 100644
--- a/test/write.buffer.test.js
+++ b/test/write.buffer.test.js
@@ -285,13 +285,6 @@ describe('parquetWriteBuffer', () => {
       .toThrow('parquet expected Uint8Array of length 16')
   })
 
-  it('throws for empty column with no type specified', () => {
-    expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [] }] }))
-      .toThrow('column empty cannot determine type')
-    expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [null, null, null, null] }] }))
-      .toThrow('column empty cannot determine type')
-  })
-
   it('throws for mixed types', () => {
     expect(() => parquetWriteBuffer({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] }))
       .toThrow('mixed types not supported')
diff --git a/test/write.schema.test.js b/test/write.schema.test.js
index cb8b29c..7be8d55 100644
--- a/test/write.schema.test.js
+++ b/test/write.schema.test.js
@@ -63,6 +63,42 @@ describe('parquet schema', () => {
     ])
   })
 
+  it('allow zero rows to be auto-typed', () => {
+    const file = parquetWriteBuffer({ columnData: [
+      { name: 'numbers', data: [] },
+    ] })
+    const metadata = parquetMetadata(file)
+    expect(metadata.schema).toEqual([
+      {
+        name: 'root',
+        num_children: 1,
+      },
+      {
+        name: 'numbers',
+        repetition_type: 'OPTIONAL',
+        type: 'BYTE_ARRAY',
+      },
+    ])
+  })
+
+  it('allow entirely null columns to be auto-typed', () => {
+    const file = parquetWriteBuffer({ columnData: [
+      { name: 'numbers', data: [null, null, null] },
+    ] })
+    const metadata = parquetMetadata(file)
+    expect(metadata.schema).toEqual([
+      {
+        name: 'root',
+        num_children: 1,
+      },
+      {
+        name: 'numbers',
+        repetition_type: 'OPTIONAL',
+        type: 'BYTE_ARRAY',
+      },
+    ])
+  })
+
   it('accepts explicit schema', () => {
     const file = parquetWriteBuffer({ columnData: [
       { name: 'numbers', data: [1, 2, 3] },