mirror of
https://github.com/asadbek064/hyparquet-writer.git
synced 2025-12-05 23:31:54 +00:00
Allow specifying column type
This commit is contained in:
parent
b6b2a24a05
commit
4bf1595981
@ -16,8 +16,8 @@ import { parquetWrite } from 'hyparquet-writer'
|
||||
|
||||
const arrayBuffer = parquetWrite({
|
||||
columnData: [
|
||||
{ name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
|
||||
{ name: 'age', data: [25, 30, 35] },
|
||||
{ name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
|
||||
{ name: 'age', data: [25, 30, 35], type: 'INT32' },
|
||||
],
|
||||
})
|
||||
```
|
||||
|
||||
@ -5,15 +5,14 @@
|
||||
* @import {ConvertedType, DecodedArray, FieldRepetitionType, ParquetType, SchemaElement} from 'hyparquet'
|
||||
* @param {string} name
|
||||
* @param {DecodedArray} values
|
||||
* @param {ParquetType | undefined} type
|
||||
* @returns {SchemaElement}
|
||||
*/
|
||||
export function getSchemaElementForValues(name, values) {
|
||||
export function getSchemaElementForValues(name, values, type) {
|
||||
if (values instanceof Int32Array) return { name, type: 'INT32', repetition_type: 'REQUIRED' }
|
||||
if (values instanceof BigInt64Array) return { name, type: 'INT64', repetition_type: 'REQUIRED' }
|
||||
if (values instanceof Float32Array) return { name, type: 'FLOAT', repetition_type: 'REQUIRED' }
|
||||
if (values instanceof Float64Array) return { name, type: 'DOUBLE', repetition_type: 'REQUIRED' }
|
||||
/** @type {ParquetType | undefined} */
|
||||
let type = undefined
|
||||
/** @type {FieldRepetitionType} */
|
||||
let repetition_type = 'REQUIRED'
|
||||
/** @type {ConvertedType | undefined} */
|
||||
|
||||
1
src/types.d.ts
vendored
1
src/types.d.ts
vendored
@ -3,6 +3,7 @@ import { DecodedArray, ParquetType } from "hyparquet"
|
||||
export interface ColumnData {
|
||||
name: string
|
||||
data: DecodedArray
|
||||
type?: ParquetType
|
||||
}
|
||||
|
||||
export interface Writer {
|
||||
|
||||
@ -39,9 +39,9 @@ export function parquetWrite({ columnData, compressed = true }) {
|
||||
const columns = []
|
||||
|
||||
// Write columns
|
||||
for (const { name, data } of columnData) {
|
||||
for (const { name, data, type } of columnData) {
|
||||
// auto-detect type
|
||||
const schemaElement = getSchemaElementForValues(name, data)
|
||||
const schemaElement = getSchemaElementForValues(name, data, type)
|
||||
if (!schemaElement.type) throw new Error(`column ${name} cannot determine type`)
|
||||
const file_offset = BigInt(writer.offset)
|
||||
/** @type {SchemaElement[]} */
|
||||
|
||||
@ -136,6 +136,32 @@ describe('parquetWrite', () => {
|
||||
])
|
||||
})
|
||||
|
||||
it('serializes empty column', async () => {
|
||||
const result = await roundTripDeserialize([{
|
||||
name: 'empty',
|
||||
data: [null, null, null, null],
|
||||
type: 'BOOLEAN',
|
||||
}])
|
||||
expect(result).toEqual([
|
||||
{ empty: null },
|
||||
{ empty: null },
|
||||
{ empty: null },
|
||||
{ empty: null },
|
||||
])
|
||||
})
|
||||
|
||||
it('throws for wrong type specified', () => {
|
||||
expect(() => parquetWrite({ columnData: [{ name: 'int', data: [1, 2, 3], type: 'BOOLEAN' }] }))
|
||||
.toThrow('parquet cannot write mixed types')
|
||||
})
|
||||
|
||||
it('throws for empty column with no type specified', () => {
|
||||
expect(() => parquetWrite({ columnData: [{ name: 'empty', data: [] }] }))
|
||||
.toThrow('column empty cannot determine type')
|
||||
expect(() => parquetWrite({ columnData: [{ name: 'empty', data: [null, null, null, null] }] }))
|
||||
.toThrow('column empty cannot determine type')
|
||||
})
|
||||
|
||||
it('throws for mixed types', () => {
|
||||
expect(() => parquetWrite({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] }))
|
||||
.toThrow('mixed types not supported')
|
||||
|
||||
Loading…
Reference in New Issue
Block a user