From 5957c219bc7aa06f7be9e67cd5a469559e8da1cd Mon Sep 17 00:00:00 2001
From: Kenny Daniel <platypii@gmail.com>
Date: Sat, 31 May 2025 22:53:24 -0700
Subject: [PATCH] Allow null columns to be auto-typed

---
 src/schema.js             |   7 ++-
 test/schema.test.js       | 123 ++++++++++++++++++++++++++++++++++++++
 test/write.buffer.test.js |   7 ---
 test/write.schema.test.js |  36 +++++++++++
 4 files changed, 165 insertions(+), 8 deletions(-)
 create mode 100644 test/schema.test.js

diff --git a/src/schema.js b/src/schema.js
index 2e999ec..bb5c8da 100644
--- a/src/schema.js
+++ b/src/schema.js
@@ -133,7 +133,12 @@ export function autoSchemaElement(name, values) {
       }
     }
   }
-  if (!type) throw new Error(`column ${name} cannot determine type`)
+  if (!type) {
+    // no typed values (e.g. empty or all-null column): fall back to nullable BYTE_ARRAY
+    // TODO: logical_type: 'NULL'
+    type = 'BYTE_ARRAY'
+    repetition_type = 'OPTIONAL'
+  }
   return { name, type, repetition_type, converted_type }
 }
 
diff --git a/test/schema.test.js b/test/schema.test.js
new file mode 100644
index 0000000..f235493
--- /dev/null
+++ b/test/schema.test.js
@@ -0,0 +1,123 @@
+import { describe, expect, it } from 'vitest'
+import { autoSchemaElement, getMaxDefinitionLevel, getMaxRepetitionLevel, schemaFromColumnData } from '../src/schema.js'
+
+/**
+ * @import {SchemaElement} from 'hyparquet'
+ */
+
+describe('schemaFromColumnData', () => { // schema built from columnData, with optional schemaOverrides
+  it('honours provided type with nullable = false → REQUIRED', () => {
+    const schema = schemaFromColumnData({
+      columnData: [
+        { name: 'id', data: new Int32Array([1, 2, 3]), type: 'INT32', nullable: false },
+      ],
+    })
+    expect(schema[1]).toEqual({ name: 'id', type: 'INT32', repetition_type: 'REQUIRED' }) // schema[0] is presumably the root element
+  })
+
+  it('applies valid schema override verbatim', () => {
+    const schema = schemaFromColumnData({
+      columnData: [{ name: 'strings', data: ['a', 'b'] }],
+      schemaOverrides: {
+        strings: { // override key matches the column name
+          name: 'strings',
+          type: 'BYTE_ARRAY',
+          converted_type: 'UTF8',
+          repetition_type: 'OPTIONAL',
+        },
+      },
+    })
+    expect(schema[1].name).toBe('strings')
+    expect(schema[1].type).toBe('BYTE_ARRAY')
+    expect(schema[1].converted_type).toBe('UTF8')
+    expect(schema[1].repetition_type).toBe('OPTIONAL')
+  })
+
+  it('throws when column lengths differ', () => {
+    expect(() =>
+      schemaFromColumnData({
+        columnData: [
+          { name: 'a', data: new Int32Array([1]) },
+          { name: 'b', data: new Int32Array([1, 2]) }, // length 2 vs length 1 for column 'a'
+        ],
+      })
+    ).toThrow(/columns must have the same length/)
+  })
+
+  it('rejects override type REPEATED', () => {
+    expect(() =>
+      schemaFromColumnData({
+        columnData: [{ name: 'x', data: new Int32Array([1]) }],
+        schemaOverrides: { x: { name: 'x', type: 'INT32', repetition_type: 'REPEATED' } }, // repetition_type (not type) is REPEATED here
+      })
+    ).toThrow(/cannot be repeated/)
+  })
+
+  it('rejects override with children', () => {
+    expect(() =>
+      schemaFromColumnData({
+        columnData: [{ name: 'x', data: new Int32Array([1]) }],
+        schemaOverrides: { x: { name: 'x', type: 'INT32', num_children: 1 } }, // num_children is the "children" being rejected
+      })
+    ).toThrow(/cannot have children/)
+  })
+
+  it('rejects override with mismatched name', () => {
+    expect(() =>
+      schemaFromColumnData({
+        columnData: [{ name: 'x', data: new Int32Array([1]) }],
+        schemaOverrides: { x: { name: 'y', type: 'INT32' } }, // override name 'y' under key/column 'x'
+      })
+    ).toThrow(/does not match column name/)
+  })
+})
+
+describe('autoSchemaElement', () => { // type inference for one column's values
+  it.each([
+    [new Int32Array([1, 2]), 'INT32'],
+    [new BigInt64Array([1n, 2n]), 'INT64'],
+    [new Float32Array([1, 2]), 'FLOAT'],
+    [new Float64Array([1, 2]), 'DOUBLE'],
+  ])('detects typed arrays (%#)', (data, expected) => { // %# is the index of the test case
+    const el = autoSchemaElement('col', data)
+    expect(el.type).toBe(expected)
+    expect(el.repetition_type).toBe('REQUIRED') // presumably because typed arrays cannot hold null — confirm in autoSchemaElement
+  })
+
+  it('promotes INT32 + DOUBLE mix to DOUBLE', () => {
+    const el = autoSchemaElement('mix', [1, 2.5]) // integer 1 mixed with fractional 2.5
+    expect(el.type).toBe('DOUBLE')
+  })
+
+  it('sets repetition_type OPTIONAL when nulls present', () => {
+    const el = autoSchemaElement('maybe', [null, 1])
+    expect(el.repetition_type).toBe('OPTIONAL')
+  })
+
+  it('falls back to BYTE_ARRAY for empty arrays', () => {
+    const el = autoSchemaElement('empty', []) // no values, so no type can be inferred
+    expect(el.type).toBe('BYTE_ARRAY')
+    expect(el.repetition_type).toBe('OPTIONAL')
+  })
+
+  it('throws on incompatible mixed scalar types', () => {
+    expect(() => autoSchemaElement('bad', [1, 'a'])).toThrow(/mixed types/) // number mixed with string
+  })
+})
+
+describe('level helpers', () => { // getMaxRepetitionLevel / getMaxDefinitionLevel on a three-element path
+  /** @type {SchemaElement[]} */
+  const path = [
+    { name: 'root', repetition_type: 'REPEATED' },
+    { name: 'child', repetition_type: 'OPTIONAL' },
+    { name: 'leaf', repetition_type: 'REPEATED' },
+  ]
+
+  it('computes max repetition level', () => {
+    expect(getMaxRepetitionLevel(path)).toBe(2) // root and leaf are the two REPEATED elements
+  })
+
+  it('computes max definition level', () => {
+    expect(getMaxDefinitionLevel(path)).toBe(2) // all three are non-REQUIRED; presumably the root is not counted — confirm
+  })
+})
diff --git a/test/write.buffer.test.js b/test/write.buffer.test.js
index ab1a2fb..4b875b6 100644
--- a/test/write.buffer.test.js
+++ b/test/write.buffer.test.js
@@ -285,13 +285,6 @@ describe('parquetWriteBuffer', () => {
       .toThrow('parquet expected Uint8Array of length 16')
   })
 
-  it('throws for empty column with no type specified', () => {
-    expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [] }] }))
-      .toThrow('column empty cannot determine type')
-    expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [null, null, null, null] }] }))
-      .toThrow('column empty cannot determine type')
-  })
-
   it('throws for mixed types', () => {
     expect(() => parquetWriteBuffer({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] }))
       .toThrow('mixed types not supported')
diff --git a/test/write.schema.test.js b/test/write.schema.test.js
index cb8b29c..7be8d55 100644
--- a/test/write.schema.test.js
+++ b/test/write.schema.test.js
@@ -63,6 +63,42 @@ describe('parquet schema', () => {
     ])
   })
 
+  it('allow zero rows to be auto-typed', () => {
+    const file = parquetWriteBuffer({ columnData: [
+      { name: 'numbers', data: [] }, // empty column with no explicit type
+    ] })
+    const metadata = parquetMetadata(file) // read the schema back from the written buffer
+    expect(metadata.schema).toEqual([
+      {
+        name: 'root',
+        num_children: 1,
+      },
+      {
+        name: 'numbers',
+        repetition_type: 'OPTIONAL',
+        type: 'BYTE_ARRAY', // fallback type when nothing can be inferred
+      },
+    ])
+  })
+
+  it('allow entirely null columns to be auto-typed', () => {
+    const file = parquetWriteBuffer({ columnData: [
+      { name: 'numbers', data: [null, null, null] }, // every value is null, no explicit type
+    ] })
+    const metadata = parquetMetadata(file) // read the schema back from the written buffer
+    expect(metadata.schema).toEqual([
+      {
+        name: 'root',
+        num_children: 1,
+      },
+      {
+        name: 'numbers',
+        repetition_type: 'OPTIONAL',
+        type: 'BYTE_ARRAY', // fallback type when nothing can be inferred
+      },
+    ])
+  })
+
   it('accepts explicit schema', () => {
     const file = parquetWriteBuffer({ columnData: [
       { name: 'numbers', data: [1, 2, 3] },