hyparquet-writer/test/write.schema.test.js

import { parquetMetadata } from 'hyparquet'
import { describe, expect, it } from 'vitest'
import { parquetWriteBuffer, schemaFromColumnData } from '../src/index.js'
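
// Tests for schema handling in parquetWriteBuffer: automatic type detection,
// per-column type hints with a nullable flag, explicit schemas, and
// schemaFromColumnData with schemaOverrides.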
describe('parquet schema', () => {
  it('auto detects types', () => {
    const file = parquetWriteBuffer({ columnData: [
      { name: 'strings', data: ['1', '2', '3'] },
    ] })
    const metadata = parquetMetadata(file)
    expect(metadata.schema).toEqual([
      {
        name: 'root',
        num_children: 1,
      },
      {
        converted_type: 'UTF8',
        name: 'strings',
        repetition_type: 'REQUIRED',
        type: 'BYTE_ARRAY',
      },
    ])
  })

  it('accepts basic type hints', () => {
    const file = parquetWriteBuffer({ columnData: [
      {
        name: 'timestamps',
        data: [new Date(1000000), new Date(2000000), new Date(3000000)],
        type: 'TIMESTAMP',
      },
    ] })
    const metadata = parquetMetadata(file)
    expect(metadata.schema).toEqual([
      {
        name: 'root',
        num_children: 1,
      },
      {
        converted_type: 'TIMESTAMP_MILLIS',
        name: 'timestamps',
        repetition_type: 'OPTIONAL',
        type: 'INT64',
      },
    ])
  })

  it('accepts nullable flag with basic type hints', () => {
    const file = parquetWriteBuffer({ columnData: [
      { name: 'numbers', data: [1, 2, 3], type: 'FLOAT', nullable: false },
    ] })
    const metadata = parquetMetadata(file)
    expect(metadata.schema).toEqual([
      {
        name: 'root',
        num_children: 1,
      },
      {
        name: 'numbers',
        repetition_type: 'REQUIRED',
        type: 'FLOAT',
      },
    ])
  })

  it('accepts explicit schema', () => {
    const file = parquetWriteBuffer({ columnData: [
      { name: 'numbers', data: [1, 2, 3] },
    ], schema: [
      { name: 'root', num_children: 1 },
      { name: 'numbers', type: 'FLOAT', repetition_type: 'REQUIRED' },
    ] })
    const metadata = parquetMetadata(file)
    expect(metadata.schema).toEqual([
      {
        name: 'root',
        num_children: 1,
      },
      {
        name: 'numbers',
        repetition_type: 'REQUIRED',
        type: 'FLOAT',
      },
    ])
  })

  it('accepts schema override', () => {
    const columnData = [
      { name: 'numbers', data: [1, 2, 3] },
    ]
    const file = parquetWriteBuffer({
      columnData,
      schema: schemaFromColumnData({
        columnData,
        schemaOverrides: {
          numbers: {
            name: 'numbers',
            type: 'DOUBLE',
            repetition_type: 'OPTIONAL',
            field_id: 1,
          },
        },
      }),
    })
    const metadata = parquetMetadata(file)
    expect(metadata.schema).toEqual([
      {
        name: 'root',
        num_children: 1,
      },
      {
        field_id: 1,
        name: 'numbers',
        repetition_type: 'OPTIONAL',
        type: 'DOUBLE',
      },
    ])
  })

  it('throws if basic types conflict with schema', () => {
    expect(() => {
      parquetWriteBuffer({
        columnData: [
          { name: 'numbers', data: [1, 2, 3], type: 'FLOAT' },
        ],
        schema: [
          { name: 'root', num_children: 1 },
          { name: 'numbers', type: 'DOUBLE', repetition_type: 'OPTIONAL' },
        ],
      })
    }).toThrow('cannot provide both schema and columnData type')
  })
})