mirror of
https://github.com/asadbek064/hyparquet-writer.git
synced 2025-12-05 23:31:54 +00:00
Allow null columns to be auto-typed
This commit is contained in:
parent
3f21e329cc
commit
5957c219bc
@ -133,7 +133,12 @@ export function autoSchemaElement(name, values) {
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!type) throw new Error(`column ${name} cannot determine type`)
|
||||
if (!type) {
|
||||
// fallback to nullable BYTE_ARRAY
|
||||
// TODO: logical_type: 'NULL'
|
||||
type = 'BYTE_ARRAY'
|
||||
repetition_type = 'OPTIONAL'
|
||||
}
|
||||
return { name, type, repetition_type, converted_type }
|
||||
}
|
||||
|
||||
|
||||
123
test/schema.test.js
Normal file
123
test/schema.test.js
Normal file
@ -0,0 +1,123 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { autoSchemaElement, getMaxDefinitionLevel, getMaxRepetitionLevel, schemaFromColumnData } from '../src/schema.js'
|
||||
|
||||
/**
|
||||
* @import {SchemaElement} from 'hyparquet'
|
||||
*/
|
||||
|
||||
// Tests for schemaFromColumnData: explicit type/nullable handling, schemaOverrides,
// and the validation errors raised for malformed input.
describe('schemaFromColumnData', () => {
|
||||
// An explicit type with nullable: false is expected to yield a REQUIRED element.
// NOTE(review): schema[1] is read as the first column; index 0 is presumably the root element — confirm.
it('honours provided type with nullable = false → REQUIRED', () => {
|
||||
const schema = schemaFromColumnData({
|
||||
columnData: [
|
||||
{ name: 'id', data: new Int32Array([1, 2, 3]), type: 'INT32', nullable: false },
|
||||
],
|
||||
})
|
||||
expect(schema[1]).toEqual({ name: 'id', type: 'INT32', repetition_type: 'REQUIRED' })
|
||||
})
|
||||
|
||||
// An override keyed by the column name is expected to surface field-for-field in the schema.
it('applies valid schema override verbatim', () => {
|
||||
const schema = schemaFromColumnData({
|
||||
columnData: [{ name: 'strings', data: ['a', 'b'] }],
|
||||
schemaOverrides: {
|
||||
strings: {
|
||||
name: 'strings',
|
||||
type: 'BYTE_ARRAY',
|
||||
converted_type: 'UTF8',
|
||||
repetition_type: 'OPTIONAL',
|
||||
},
|
||||
},
|
||||
})
|
||||
expect(schema[1].name).toBe('strings')
|
||||
expect(schema[1].type).toBe('BYTE_ARRAY')
|
||||
expect(schema[1].converted_type).toBe('UTF8')
|
||||
expect(schema[1].repetition_type).toBe('OPTIONAL')
|
||||
})
|
||||
|
||||
// Columns holding 1 and 2 values respectively must be rejected.
it('throws when column lengths differ', () => {
|
||||
expect(() =>
|
||||
schemaFromColumnData({
|
||||
columnData: [
|
||||
{ name: 'a', data: new Int32Array([1]) },
|
||||
{ name: 'b', data: new Int32Array([1, 2]) },
|
||||
],
|
||||
})
|
||||
).toThrow(/columns must have the same length/)
|
||||
})
|
||||
|
||||
// An override may not set repetition_type to 'REPEATED'.
it('rejects override type REPEATED', () => {
|
||||
expect(() =>
|
||||
schemaFromColumnData({
|
||||
columnData: [{ name: 'x', data: new Int32Array([1]) }],
|
||||
schemaOverrides: { x: { name: 'x', type: 'INT32', repetition_type: 'REPEATED' } },
|
||||
})
|
||||
).toThrow(/cannot be repeated/)
|
||||
})
|
||||
|
||||
// An override may not declare num_children.
it('rejects override with children', () => {
|
||||
expect(() =>
|
||||
schemaFromColumnData({
|
||||
columnData: [{ name: 'x', data: new Int32Array([1]) }],
|
||||
schemaOverrides: { x: { name: 'x', type: 'INT32', num_children: 1 } },
|
||||
})
|
||||
).toThrow(/cannot have children/)
|
||||
})
|
||||
|
||||
// The override's own name ('y') must equal the column name it is keyed under ('x').
it('rejects override with mismatched name', () => {
|
||||
expect(() =>
|
||||
schemaFromColumnData({
|
||||
columnData: [{ name: 'x', data: new Int32Array([1]) }],
|
||||
schemaOverrides: { x: { name: 'y', type: 'INT32' } },
|
||||
})
|
||||
).toThrow(/does not match column name/)
|
||||
})
|
||||
|
||||
})
|
||||
|
||||
// Tests for autoSchemaElement(name, values): type detection from the supplied values,
// nullability detection, the empty-input fallback, and the mixed-type error.
describe('autoSchemaElement', () => {
|
||||
// Table-driven: each row pairs a typed array with the type string expected for it.
// Every case is also expected to come back as REQUIRED.
it.each([
|
||||
[new Int32Array([1, 2]), 'INT32'],
|
||||
[new BigInt64Array([1n, 2n]), 'INT64'],
|
||||
[new Float32Array([1, 2]), 'FLOAT'],
|
||||
[new Float64Array([1, 2]), 'DOUBLE'],
|
||||
])('detects typed arrays (%#)', (data, expected) => {
|
||||
const el = autoSchemaElement('col', data)
|
||||
expect(el.type).toBe(expected)
|
||||
expect(el.repetition_type).toBe('REQUIRED')
|
||||
})
|
||||
|
||||
// A plain array mixing an integer (1) and a fractional number (2.5) is expected to be typed DOUBLE.
it('promotes INT32 + DOUBLE mix to DOUBLE', () => {
|
||||
const el = autoSchemaElement('mix', [1, 2.5])
|
||||
expect(el.type).toBe('DOUBLE')
|
||||
})
|
||||
|
||||
// A single null among the values is expected to make the element OPTIONAL.
it('sets repetition_type OPTIONAL when nulls present', () => {
|
||||
const el = autoSchemaElement('maybe', [null, 1])
|
||||
expect(el.repetition_type).toBe('OPTIONAL')
|
||||
})
|
||||
|
||||
// With no values to infer from, the expected result is an OPTIONAL BYTE_ARRAY element.
it('falls back to BYTE_ARRAY for empty arrays', () => {
|
||||
const el = autoSchemaElement('empty', [])
|
||||
expect(el.type).toBe('BYTE_ARRAY')
|
||||
expect(el.repetition_type).toBe('OPTIONAL')
|
||||
})
|
||||
|
||||
// A number mixed with a string must raise an error whose message matches /mixed types/.
it('throws on incompatible mixed scalar types', () => {
|
||||
expect(() => autoSchemaElement('bad', [1, 'a'])).toThrow(/mixed types/)
|
||||
})
|
||||
})
|
||||
|
||||
// Tests for getMaxRepetitionLevel and getMaxDefinitionLevel against one shared schema path.
describe('level helpers', () => {
|
||||
// Fixture: a three-element path with repetition types REPEATED, OPTIONAL, REPEATED.
/** @type {SchemaElement[]} */
|
||||
const path = [
|
||||
{ name: 'root', repetition_type: 'REPEATED' },
|
||||
{ name: 'child', repetition_type: 'OPTIONAL' },
|
||||
{ name: 'leaf', repetition_type: 'REPEATED' },
|
||||
]
|
||||
|
||||
// The fixture contains two REPEATED elements and the expected level is 2.
it('computes max repetition level', () => {
|
||||
expect(getMaxRepetitionLevel(path)).toBe(2)
|
||||
})
|
||||
|
||||
// NOTE(review): all three fixture elements are non-REQUIRED, yet the expected level is 2 —
// presumably the first (root) element is excluded from the count; confirm against getMaxDefinitionLevel.
it('computes max definition level', () => {
|
||||
expect(getMaxDefinitionLevel(path)).toBe(2)
|
||||
})
|
||||
})
|
||||
@ -285,13 +285,6 @@ describe('parquetWriteBuffer', () => {
|
||||
.toThrow('parquet expected Uint8Array of length 16')
|
||||
})
|
||||
|
||||
// Expects parquetWriteBuffer to throw 'column empty cannot determine type' for a column that is
// either empty or entirely null when no type is given.
// NOTE(review): the enclosing hunk header shrinks this region from 13 to 6 lines, so this test is
// presumably the removed side of the diff — confirm against the upstream commit.
it('throws for empty column with no type specified', () => {
|
||||
expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [] }] }))
|
||||
.toThrow('column empty cannot determine type')
|
||||
expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [null, null, null, null] }] }))
|
||||
.toThrow('column empty cannot determine type')
|
||||
})
|
||||
|
||||
it('throws for mixed types', () => {
|
||||
expect(() => parquetWriteBuffer({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] }))
|
||||
.toThrow('mixed types not supported')
|
||||
|
||||
@ -63,6 +63,42 @@ describe('parquet schema', () => {
|
||||
])
|
||||
})
|
||||
|
||||
// Writes a single untyped column with zero values, reads the file metadata back, and expects
// the schema to be a root with one child plus an OPTIONAL BYTE_ARRAY column element.
it('allow zero rows to be auto-typed', () => {
|
||||
const file = parquetWriteBuffer({ columnData: [
|
||||
{ name: 'numbers', data: [] },
|
||||
] })
|
||||
const metadata = parquetMetadata(file)
|
||||
expect(metadata.schema).toEqual([
|
||||
{
|
||||
name: 'root',
|
||||
num_children: 1,
|
||||
},
|
||||
{
|
||||
name: 'numbers',
|
||||
repetition_type: 'OPTIONAL',
|
||||
type: 'BYTE_ARRAY',
|
||||
},
|
||||
])
|
||||
})
|
||||
|
||||
// Writes a single untyped column whose three values are all null, reads the file metadata back,
// and expects the schema to be a root with one child plus an OPTIONAL BYTE_ARRAY column element.
it('allow entirely null columns to be auto-typed', () => {
|
||||
const file = parquetWriteBuffer({ columnData: [
|
||||
{ name: 'numbers', data: [null, null, null] },
|
||||
] })
|
||||
const metadata = parquetMetadata(file)
|
||||
expect(metadata.schema).toEqual([
|
||||
{
|
||||
name: 'root',
|
||||
num_children: 1,
|
||||
},
|
||||
{
|
||||
name: 'numbers',
|
||||
repetition_type: 'OPTIONAL',
|
||||
type: 'BYTE_ARRAY',
|
||||
},
|
||||
])
|
||||
})
|
||||
|
||||
it('accepts explicit schema', () => {
|
||||
const file = parquetWriteBuffer({ columnData: [
|
||||
{ name: 'numbers', data: [1, 2, 3] },
|
||||
|
||||
Loading…
Reference in New Issue
Block a user