Allow null columns to be auto-typed

This commit is contained in:
Kenny Daniel 2025-05-31 22:53:24 -07:00
parent 3f21e329cc
commit 5957c219bc
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 165 additions and 8 deletions

@ -133,7 +133,12 @@ export function autoSchemaElement(name, values) {
}
}
}
if (!type) throw new Error(`column ${name} cannot determine type`)
if (!type) {
// fallback to nullable BYTE_ARRAY
// TODO: logical_type: 'NULL'
type = 'BYTE_ARRAY'
repetition_type = 'OPTIONAL'
}
return { name, type, repetition_type, converted_type }
}

123
test/schema.test.js Normal file

@ -0,0 +1,123 @@
import { describe, expect, it } from 'vitest'
import { autoSchemaElement, getMaxDefinitionLevel, getMaxRepetitionLevel, schemaFromColumnData } from '../src/schema.js'
/**
* @import {SchemaElement} from 'hyparquet'
*/
// Covers schemaFromColumnData: explicit type hints, schema overrides, and rejected inputs.
describe('schemaFromColumnData', () => {
  it('honours provided type with nullable = false → REQUIRED', () => {
    const result = schemaFromColumnData({
      columnData: [
        { name: 'id', data: new Int32Array([1, 2, 3]), type: 'INT32', nullable: false },
      ],
    })
    // The column's element is read from index 1 of the returned schema
    const idElement = result[1]
    expect(idElement).toEqual({ name: 'id', type: 'INT32', repetition_type: 'REQUIRED' })
  })

  it('applies valid schema override verbatim', () => {
    const result = schemaFromColumnData({
      columnData: [{ name: 'strings', data: ['a', 'b'] }],
      schemaOverrides: {
        strings: {
          name: 'strings',
          type: 'BYTE_ARRAY',
          converted_type: 'UTF8',
          repetition_type: 'OPTIONAL',
        },
      },
    })
    // Every field supplied in the override must come back unchanged
    const { name, type, converted_type, repetition_type } = result[1]
    expect(name).toBe('strings')
    expect(type).toBe('BYTE_ARRAY')
    expect(converted_type).toBe('UTF8')
    expect(repetition_type).toBe('OPTIONAL')
  })

  it('throws when column lengths differ', () => {
    // Column 'a' has one value, column 'b' has two
    const buildSchema = () => schemaFromColumnData({
      columnData: [
        { name: 'a', data: new Int32Array([1]) },
        { name: 'b', data: new Int32Array([1, 2]) },
      ],
    })
    expect(buildSchema).toThrow(/columns must have the same length/)
  })

  it('rejects override type REPEATED', () => {
    const buildSchema = () => schemaFromColumnData({
      columnData: [{ name: 'x', data: new Int32Array([1]) }],
      schemaOverrides: { x: { name: 'x', type: 'INT32', repetition_type: 'REPEATED' } },
    })
    expect(buildSchema).toThrow(/cannot be repeated/)
  })

  it('rejects override with children', () => {
    const buildSchema = () => schemaFromColumnData({
      columnData: [{ name: 'x', data: new Int32Array([1]) }],
      schemaOverrides: { x: { name: 'x', type: 'INT32', num_children: 1 } },
    })
    expect(buildSchema).toThrow(/cannot have children/)
  })

  it('rejects override with mismatched name', () => {
    // The override is keyed by 'x' but declares name 'y'
    const buildSchema = () => schemaFromColumnData({
      columnData: [{ name: 'x', data: new Int32Array([1]) }],
      schemaOverrides: { x: { name: 'y', type: 'INT32' } },
    })
    expect(buildSchema).toThrow(/does not match column name/)
  })
})
// Covers autoSchemaElement: type detection from values, nullability,
// the BYTE_ARRAY fallback when no type can be inferred, and mixed-type rejection.
describe('autoSchemaElement', () => {
  // Each row pairs a typed array with the parquet type expected for it
  it.each([
    [new Int32Array([1, 2]), 'INT32'],
    [new BigInt64Array([1n, 2n]), 'INT64'],
    [new Float32Array([1, 2]), 'FLOAT'],
    [new Float64Array([1, 2]), 'DOUBLE'],
  ])('detects typed arrays (%#)', (data, expected) => {
    const el = autoSchemaElement('col', data)
    expect(el.type).toBe(expected)
    expect(el.repetition_type).toBe('REQUIRED')
  })

  it('promotes INT32 + DOUBLE mix to DOUBLE', () => {
    const el = autoSchemaElement('mix', [1, 2.5])
    expect(el.type).toBe('DOUBLE')
  })

  it('sets repetition_type OPTIONAL when nulls present', () => {
    const el = autoSchemaElement('maybe', [null, 1])
    expect(el.repetition_type).toBe('OPTIONAL')
  })

  it('falls back to BYTE_ARRAY for empty arrays', () => {
    const el = autoSchemaElement('empty', [])
    expect(el.type).toBe('BYTE_ARRAY')
    expect(el.repetition_type).toBe('OPTIONAL')
  })

  // An all-null column gives no value to infer a type from, so it should take
  // the same nullable BYTE_ARRAY fallback as an empty column.
  it('falls back to BYTE_ARRAY for all-null arrays', () => {
    const el = autoSchemaElement('nulls', [null, null, null])
    expect(el.type).toBe('BYTE_ARRAY')
    expect(el.repetition_type).toBe('OPTIONAL')
  })

  it('throws on incompatible mixed scalar types', () => {
    expect(() => autoSchemaElement('bad', [1, 'a'])).toThrow(/mixed types/)
  })
})
// Checks getMaxRepetitionLevel and getMaxDefinitionLevel against one fixed three-element path.
describe('level helpers', () => {
  /** @type {SchemaElement[]} */
  const schemaPath = [
    { name: 'root', repetition_type: 'REPEATED' },
    { name: 'child', repetition_type: 'OPTIONAL' },
    { name: 'leaf', repetition_type: 'REPEATED' },
  ]

  it('computes max repetition level', () => {
    const maxRepetition = getMaxRepetitionLevel(schemaPath)
    expect(maxRepetition).toBe(2)
  })

  it('computes max definition level', () => {
    const maxDefinition = getMaxDefinitionLevel(schemaPath)
    expect(maxDefinition).toBe(2)
  })
})

@ -285,13 +285,6 @@ describe('parquetWriteBuffer', () => {
.toThrow('parquet expected Uint8Array of length 16')
})
// Expects parquetWriteBuffer to throw when a column has no declared type and
// nothing to infer one from: first with zero rows, then with only null values.
// NOTE(review): the autoSchemaElement hunk in this diff replaces the
// 'cannot determine type' throw with a nullable BYTE_ARRAY fallback, and this
// hunk's header shrinks from 13 to 6 lines, so this test looks like the removed
// side of the change — confirm it is deleted rather than kept.
it('throws for empty column with no type specified', () => {
expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [] }] }))
.toThrow('column empty cannot determine type')
expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [null, null, null, null] }] }))
.toThrow('column empty cannot determine type')
})
it('throws for mixed types', () => {
expect(() => parquetWriteBuffer({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] }))
.toThrow('mixed types not supported')

@ -63,6 +63,42 @@ describe('parquet schema', () => {
])
})
it('allow zero rows to be auto-typed', () => {
  // A column with no rows and no declared type is written without error
  const buffer = parquetWriteBuffer({
    columnData: [{ name: 'numbers', data: [] }],
  })
  const { schema } = parquetMetadata(buffer)
  // The written schema reports the column as an optional BYTE_ARRAY
  expect(schema).toEqual([
    { name: 'root', num_children: 1 },
    { name: 'numbers', repetition_type: 'OPTIONAL', type: 'BYTE_ARRAY' },
  ])
})
it('allow entirely null columns to be auto-typed', () => {
  // A column holding only nulls and no declared type is written without error
  const buffer = parquetWriteBuffer({
    columnData: [{ name: 'numbers', data: [null, null, null] }],
  })
  const { schema } = parquetMetadata(buffer)
  // The written schema reports the column as an optional BYTE_ARRAY
  expect(schema).toEqual([
    { name: 'root', num_children: 1 },
    { name: 'numbers', repetition_type: 'OPTIONAL', type: 'BYTE_ARRAY' },
  ])
})
it('accepts explicit schema', () => {
const file = parquetWriteBuffer({ columnData: [
{ name: 'numbers', data: [1, 2, 3] },