Allow specifying column type

This commit is contained in:
Kenny Daniel 2025-03-28 16:13:27 -07:00
parent b6b2a24a05
commit 4bf1595981
No known key found for this signature in database
GPG Key ID: FDF16101AF5AFD3A
5 changed files with 33 additions and 7 deletions

@ -16,8 +16,8 @@ import { parquetWrite } from 'hyparquet-writer'
const arrayBuffer = parquetWrite({
columnData: [
{ name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
{ name: 'age', data: [25, 30, 35] },
{ name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
{ name: 'age', data: [25, 30, 35], type: 'INT32' },
],
})
```

@ -5,15 +5,14 @@
* @import {ConvertedType, DecodedArray, FieldRepetitionType, ParquetType, SchemaElement} from 'hyparquet'
* @param {string} name
* @param {DecodedArray} values
* @param {ParquetType | undefined} type
* @returns {SchemaElement}
*/
export function getSchemaElementForValues(name, values) {
export function getSchemaElementForValues(name, values, type) {
if (values instanceof Int32Array) return { name, type: 'INT32', repetition_type: 'REQUIRED' }
if (values instanceof BigInt64Array) return { name, type: 'INT64', repetition_type: 'REQUIRED' }
if (values instanceof Float32Array) return { name, type: 'FLOAT', repetition_type: 'REQUIRED' }
if (values instanceof Float64Array) return { name, type: 'DOUBLE', repetition_type: 'REQUIRED' }
/** @type {ParquetType | undefined} */
let type = undefined
/** @type {FieldRepetitionType} */
let repetition_type = 'REQUIRED'
/** @type {ConvertedType | undefined} */

1
src/types.d.ts vendored

@ -3,6 +3,7 @@ import { DecodedArray, ParquetType } from "hyparquet"
/**
 * Describes a single column passed to `parquetWrite` via `columnData`.
 */
export interface ColumnData {
/** Column name; used as the schema element name and in error messages. */
name: string
/** Values of the column, one entry per row. */
data: DecodedArray
/**
 * Optional parquet type for the column.
 * When omitted, the writer tries to determine the type from `data` and
 * throws if it cannot (for example, when the data is empty or all null).
 */
type?: ParquetType
}
export interface Writer {

@ -39,9 +39,9 @@ export function parquetWrite({ columnData, compressed = true }) {
const columns = []
// Write columns
for (const { name, data } of columnData) {
for (const { name, data, type } of columnData) {
// resolve the column type: explicit `type` hint if provided, otherwise auto-detect from the data
const schemaElement = getSchemaElementForValues(name, data)
const schemaElement = getSchemaElementForValues(name, data, type)
if (!schemaElement.type) throw new Error(`column ${name} cannot determine type`)
const file_offset = BigInt(writer.offset)
/** @type {SchemaElement[]} */

@ -136,6 +136,32 @@ describe('parquetWrite', () => {
])
})
// An all-null column carries no values to infer a type from, so the explicit
// `type` is what allows it to be written; every row should read back as null.
it('serializes empty column', async () => {
const result = await roundTripDeserialize([{
name: 'empty',
data: [null, null, null, null],
type: 'BOOLEAN',
}])
expect(result).toEqual([
{ empty: null },
{ empty: null },
{ empty: null },
{ empty: null },
])
})
// The data is numeric but the declared type is BOOLEAN: the writer is expected
// to reject the mismatch rather than write the column.
it('throws for wrong type specified', () => {
expect(() => parquetWrite({ columnData: [{ name: 'int', data: [1, 2, 3], type: 'BOOLEAN' }] }))
.toThrow('parquet cannot write mixed types')
})
// Without an explicit `type`, neither an empty array nor an all-null array
// gives the writer anything to infer a type from, so both cases must throw.
it('throws for empty column with no type specified', () => {
expect(() => parquetWrite({ columnData: [{ name: 'empty', data: [] }] }))
.toThrow('column empty cannot determine type')
expect(() => parquetWrite({ columnData: [{ name: 'empty', data: [null, null, null, null] }] }))
.toThrow('column empty cannot determine type')
})
it('throws for mixed types', () => {
expect(() => parquetWrite({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] }))
.toThrow('mixed types not supported')