import { parquetMetadata, parquetReadObjects } from 'hyparquet'
import { describe, expect, it } from 'vitest'
import { parquetWriteBuffer } from '../src/index.js'
import { exampleData, exampleMetadata } from './example.js'

/**
|
|
|
|
|
* Utility to encode a parquet file and then read it back into a JS object.
|
|
|
|
|
*
|
2025-03-26 07:45:22 +00:00
|
|
|
* @import {ColumnData} from '../src/types.js'
|
|
|
|
|
* @param {ColumnData[]} columnData
|
2025-03-26 04:06:43 +00:00
|
|
|
* @returns {Promise<Record<string, any>>}
|
|
|
|
|
*/
|
|
|
|
|
async function roundTripDeserialize(columnData) {
|
2025-04-08 10:22:30 +00:00
|
|
|
const file = parquetWriteBuffer({ columnData })
|
2025-03-26 07:45:22 +00:00
|
|
|
return await parquetReadObjects({ file, utf8: false })
|
2025-03-26 04:06:43 +00:00
|
|
|
}
|
|
|
|
|
|
2025-04-08 10:22:30 +00:00
|
|
|
describe('parquetWriteBuffer', () => {
|
2025-03-26 04:06:43 +00:00
|
|
|
it('writes expected metadata', () => {
|
2025-04-14 04:15:29 +00:00
|
|
|
const file = parquetWriteBuffer({ columnData: exampleData })
|
2025-03-26 04:06:43 +00:00
|
|
|
const metadata = parquetMetadata(file)
|
|
|
|
|
expect(metadata).toEqual(exampleMetadata)
|
|
|
|
|
})
|
|
|
|
|
|
2025-03-26 07:11:14 +00:00
|
|
|
it('serializes basic types', async () => {
|
2025-04-14 04:15:29 +00:00
|
|
|
const result = await roundTripDeserialize(exampleData)
|
2025-03-26 04:06:43 +00:00
|
|
|
expect(result).toEqual([
|
2025-04-08 11:20:32 +00:00
|
|
|
{ bool: true, int: 0, bigint: 0n, float: 0, double: 0, string: 'a', nullable: true },
|
|
|
|
|
{ bool: false, int: 127, bigint: 127n, float: 0.00009999999747378752, double: 0.0001, string: 'b', nullable: false },
|
|
|
|
|
{ bool: true, int: 0x7fff, bigint: 0x7fffn, float: 123.45600128173828, double: 123.456, string: 'c', nullable: null },
|
|
|
|
|
{ bool: false, int: 0x7fffffff, bigint: 0x7fffffffffffffffn, float: Infinity, double: 1e100, string: 'd', nullable: null },
|
2025-03-26 04:06:43 +00:00
|
|
|
])
|
|
|
|
|
})
|
2025-03-26 05:55:02 +00:00
|
|
|
|
2025-03-27 06:46:40 +00:00
|
|
|
it('efficiently serializes sparse booleans', async () => {
|
2025-03-26 05:55:02 +00:00
|
|
|
const bool = Array(10000).fill(null)
|
|
|
|
|
bool[10] = true
|
|
|
|
|
bool[100] = false
|
|
|
|
|
bool[500] = true
|
|
|
|
|
bool[9999] = false
|
2025-04-08 10:22:30 +00:00
|
|
|
const file = parquetWriteBuffer({ columnData: [{ name: 'bool', data: bool }] })
|
2025-04-17 08:09:43 +00:00
|
|
|
expect(file.byteLength).toBe(154)
|
2025-03-27 06:46:40 +00:00
|
|
|
const metadata = parquetMetadata(file)
|
2025-04-17 08:09:43 +00:00
|
|
|
expect(metadata.metadata_length).toBe(92)
|
2025-03-27 06:46:40 +00:00
|
|
|
const result = await parquetReadObjects({ file })
|
|
|
|
|
expect(result.length).toBe(10000)
|
|
|
|
|
expect(result[0]).toEqual({ bool: null })
|
|
|
|
|
expect(result[9]).toEqual({ bool: null })
|
|
|
|
|
expect(result[10]).toEqual({ bool: true })
|
|
|
|
|
expect(result[100]).toEqual({ bool: false })
|
|
|
|
|
expect(result[500]).toEqual({ bool: true })
|
|
|
|
|
expect(result[9999]).toEqual({ bool: false })
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
it('efficiently serializes long string', () => {
|
|
|
|
|
const str = 'a'.repeat(10000)
|
2025-04-08 10:22:30 +00:00
|
|
|
const file = parquetWriteBuffer({ columnData: [{ name: 'string', data: [str] }] })
|
2025-04-17 08:09:43 +00:00
|
|
|
expect(file.byteLength).toBe(638)
|
2025-03-26 05:55:02 +00:00
|
|
|
})
|
2025-03-26 07:11:14 +00:00
|
|
|
|
2025-03-27 07:27:22 +00:00
|
|
|
it('less efficiently serializes string without compression', () => {
|
|
|
|
|
const str = 'a'.repeat(10000)
|
|
|
|
|
const columnData = [{ name: 'string', data: [str] }]
|
2025-04-08 10:22:30 +00:00
|
|
|
const file = parquetWriteBuffer({ columnData, compressed: false })
|
2025-04-17 08:09:43 +00:00
|
|
|
expect(file.byteLength).toBe(10168)
|
2025-03-27 07:27:22 +00:00
|
|
|
})
|
|
|
|
|
|
2025-03-28 06:30:32 +00:00
|
|
|
it('efficiently serializes column with few distinct values', async () => {
|
2025-03-29 19:28:25 +00:00
|
|
|
const data = Array(100000)
|
|
|
|
|
.fill('aaaa', 0, 50000)
|
|
|
|
|
.fill('bbbb', 50000, 100000)
|
2025-04-08 10:22:30 +00:00
|
|
|
const file = parquetWriteBuffer({ columnData: [{ name: 'string', data }], statistics: false })
|
2025-04-17 08:09:43 +00:00
|
|
|
expect(file.byteLength).toBe(170)
|
2025-03-28 06:30:32 +00:00
|
|
|
// round trip
|
|
|
|
|
const result = await parquetReadObjects({ file })
|
2025-03-29 19:28:25 +00:00
|
|
|
expect(result.length).toBe(100000)
|
2025-03-28 06:30:32 +00:00
|
|
|
expect(result[0]).toEqual({ string: 'aaaa' })
|
2025-03-29 19:28:25 +00:00
|
|
|
expect(result[50000]).toEqual({ string: 'bbbb' })
|
2025-03-27 07:27:22 +00:00
|
|
|
})
|
|
|
|
|
|
2025-04-03 20:21:57 +00:00
|
|
|
it('writes statistics when enabled', () => {
|
2025-04-14 04:15:29 +00:00
|
|
|
const withStats = parquetWriteBuffer({ columnData: exampleData, statistics: true })
|
|
|
|
|
const noStats = parquetWriteBuffer({ columnData: exampleData, statistics: false })
|
2025-04-17 08:09:43 +00:00
|
|
|
expect(withStats.byteLength).toBe(721)
|
|
|
|
|
expect(noStats.byteLength).toBe(611)
|
2025-04-03 20:21:57 +00:00
|
|
|
})
|
|
|
|
|
|
2025-03-26 07:11:14 +00:00
|
|
|
it('serializes list types', async () => {
|
2025-03-26 07:45:22 +00:00
|
|
|
const result = await roundTripDeserialize([{
|
|
|
|
|
name: 'list',
|
|
|
|
|
data: [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
|
|
|
|
|
}])
|
2025-03-26 07:11:14 +00:00
|
|
|
expect(result).toEqual([
|
|
|
|
|
{ list: [1, 2, 3] },
|
|
|
|
|
{ list: [4, 5, 6] },
|
|
|
|
|
{ list: [7, 8, 9] },
|
|
|
|
|
{ list: [10, 11, 12] },
|
|
|
|
|
])
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
it('serializes object types', async () => {
|
2025-03-26 07:45:22 +00:00
|
|
|
const result = await roundTripDeserialize([{
|
|
|
|
|
name: 'obj',
|
|
|
|
|
data: [{ a: 1, b: 2 }, { a: 3, b: 4 }, { a: 5, b: 6 }, { a: 7, b: 8 }],
|
|
|
|
|
}])
|
2025-03-26 07:11:14 +00:00
|
|
|
expect(result).toEqual([
|
|
|
|
|
{ obj: { a: 1, b: 2 } },
|
|
|
|
|
{ obj: { a: 3, b: 4 } },
|
|
|
|
|
{ obj: { a: 5, b: 6 } },
|
|
|
|
|
{ obj: { a: 7, b: 8 } },
|
|
|
|
|
])
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
it('serializes date types', async () => {
|
2025-03-26 07:45:22 +00:00
|
|
|
const result = await roundTripDeserialize([{
|
|
|
|
|
name: 'date',
|
|
|
|
|
data: [new Date(0), new Date(100000), new Date(200000), new Date(300000)],
|
|
|
|
|
}])
|
2025-03-26 07:11:14 +00:00
|
|
|
expect(result).toEqual([
|
|
|
|
|
{ date: new Date(0) },
|
|
|
|
|
{ date: new Date(100000) },
|
|
|
|
|
{ date: new Date(200000) },
|
|
|
|
|
{ date: new Date(300000) },
|
|
|
|
|
])
|
|
|
|
|
})
|
|
|
|
|
|
2025-03-26 07:45:22 +00:00
|
|
|
it('serializes byte array types', async () => {
|
|
|
|
|
const result = await roundTripDeserialize([{
|
|
|
|
|
name: 'bytes',
|
|
|
|
|
data: [Uint8Array.of(1, 2, 3), Uint8Array.of(4, 5, 6), Uint8Array.of(7, 8, 9), Uint8Array.of(10, 11, 12)],
|
|
|
|
|
}])
|
|
|
|
|
expect(result).toEqual([
|
|
|
|
|
{ bytes: Uint8Array.of(1, 2, 3) },
|
|
|
|
|
{ bytes: Uint8Array.of(4, 5, 6) },
|
|
|
|
|
{ bytes: Uint8Array.of(7, 8, 9) },
|
|
|
|
|
{ bytes: Uint8Array.of(10, 11, 12) },
|
|
|
|
|
])
|
|
|
|
|
})
|
|
|
|
|
|
2025-03-28 23:13:27 +00:00
|
|
|
it('serializes empty column', async () => {
|
|
|
|
|
const result = await roundTripDeserialize([{
|
|
|
|
|
name: 'empty',
|
|
|
|
|
data: [null, null, null, null],
|
|
|
|
|
type: 'BOOLEAN',
|
|
|
|
|
}])
|
|
|
|
|
expect(result).toEqual([
|
|
|
|
|
{ empty: null },
|
|
|
|
|
{ empty: null },
|
|
|
|
|
{ empty: null },
|
|
|
|
|
{ empty: null },
|
|
|
|
|
])
|
|
|
|
|
})
|
|
|
|
|
|
2025-03-29 19:28:25 +00:00
|
|
|
it('serializes empty table', async () => {
|
|
|
|
|
const result = await roundTripDeserialize([])
|
|
|
|
|
expect(result).toEqual([])
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
it('handles special numeric values', async () => {
|
|
|
|
|
const data = [
|
|
|
|
|
{ name: 'double', data: [NaN, Infinity, -Infinity, 42, 0, -0] },
|
|
|
|
|
]
|
|
|
|
|
const result = await roundTripDeserialize(data)
|
|
|
|
|
expect(result[0].double).toBeNaN()
|
|
|
|
|
expect(result[1].double).toEqual(Infinity)
|
|
|
|
|
expect(result[2].double).toEqual(-Infinity)
|
|
|
|
|
expect(result[3].double).toEqual(42)
|
|
|
|
|
expect(result[4].double).toEqual(0)
|
|
|
|
|
expect(result[5].double).toEqual(-0)
|
|
|
|
|
expect(result[5].double).not.toEqual(0)
|
|
|
|
|
})
|
|
|
|
|
|
2025-04-03 07:42:54 +00:00
|
|
|
it('splits row groups', async () => {
|
|
|
|
|
const data = Array(200).fill(13)
|
2025-04-08 10:22:30 +00:00
|
|
|
const file = parquetWriteBuffer({ columnData: [{ name: 'int', data }], rowGroupSize: 100 })
|
2025-04-03 07:42:54 +00:00
|
|
|
const metadata = parquetMetadata(file)
|
|
|
|
|
expect(metadata.row_groups.length).toBe(2)
|
|
|
|
|
expect(metadata.row_groups[0].num_rows).toBe(100n)
|
|
|
|
|
expect(metadata.row_groups[1].num_rows).toBe(100n)
|
|
|
|
|
// round trip
|
|
|
|
|
const result = await parquetReadObjects({ file })
|
|
|
|
|
expect(result.length).toBe(200)
|
|
|
|
|
expect(result[0]).toEqual({ int: 13 })
|
|
|
|
|
expect(result[99]).toEqual({ int: 13 })
|
|
|
|
|
expect(result[100]).toEqual({ int: 13 })
|
|
|
|
|
expect(result[199]).toEqual({ int: 13 })
|
|
|
|
|
})
|
|
|
|
|
|
2025-03-28 23:13:27 +00:00
|
|
|
it('throws for wrong type specified', () => {
|
2025-04-08 10:22:30 +00:00
|
|
|
expect(() => parquetWriteBuffer({ columnData: [{ name: 'int', data: [1, 2, 3], type: 'BOOLEAN' }] }))
|
2025-04-11 08:41:56 +00:00
|
|
|
.toThrow('parquet expected boolean value')
|
2025-03-28 23:13:27 +00:00
|
|
|
})
|
|
|
|
|
|
|
|
|
|
it('throws for empty column with no type specified', () => {
|
2025-04-08 10:22:30 +00:00
|
|
|
expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [] }] }))
|
2025-03-28 23:13:27 +00:00
|
|
|
.toThrow('column empty cannot determine type')
|
2025-04-08 10:22:30 +00:00
|
|
|
expect(() => parquetWriteBuffer({ columnData: [{ name: 'empty', data: [null, null, null, null] }] }))
|
2025-03-28 23:13:27 +00:00
|
|
|
.toThrow('column empty cannot determine type')
|
|
|
|
|
})
|
|
|
|
|
|
2025-03-26 07:11:14 +00:00
|
|
|
it('throws for mixed types', () => {
|
2025-04-08 10:22:30 +00:00
|
|
|
expect(() => parquetWriteBuffer({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] }))
|
2025-03-26 07:45:22 +00:00
|
|
|
.toThrow('mixed types not supported')
|
2025-03-26 07:11:14 +00:00
|
|
|
})
|
2025-03-29 19:28:25 +00:00
|
|
|
|
|
|
|
|
it('throws error when columns have mismatched lengths', () => {
|
2025-04-08 10:22:30 +00:00
|
|
|
expect(() => parquetWriteBuffer({ columnData: [
|
2025-03-29 19:28:25 +00:00
|
|
|
{ name: 'col1', data: [1, 2, 3] },
|
|
|
|
|
{ name: 'col2', data: [4, 5] },
|
|
|
|
|
] })).toThrow('columns must have the same length')
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
it('throws error for unsupported data types', () => {
|
2025-04-08 10:22:30 +00:00
|
|
|
expect(() => parquetWriteBuffer({ columnData: [{ name: 'func', data: [() => {}] }] }))
|
2025-03-29 19:28:25 +00:00
|
|
|
.toThrow('cannot determine parquet type for: () => {}')
|
|
|
|
|
})
|
2025-03-26 04:06:43 +00:00
|
|
|
})
|