2024-01-15 19:08:48 +00:00
|
|
|
import { describe, expect, it } from 'vitest'
|
2024-08-20 18:30:39 +00:00
|
|
|
import { parquetRead, parquetReadObjects } from '../src/hyparquet.js'
|
2024-07-26 22:01:01 +00:00
|
|
|
import { asyncBufferFromFile, toJson } from '../src/utils.js'
|
2024-02-23 18:25:06 +00:00
|
|
|
|
2024-02-17 00:07:09 +00:00
|
|
|
describe('parquetRead', () => {
|
2024-04-12 20:09:31 +00:00
|
|
|
it('throws error for undefined file', async () => {
  // Passing no file at all is expected to reject with the AsyncBuffer message.
  // @ts-expect-error testing invalid input
  const pendingRead = parquetRead({ file: undefined })
  await expect(pendingRead).rejects.toThrow('parquetRead expected file AsyncBuffer')
})
|
|
|
|
|
|
2024-04-18 00:45:15 +00:00
|
|
|
it('throws error for undefined byteLength', async () => {
  // An object that has a slice method but whose byteLength is undefined
  const slice = () => new ArrayBuffer(0)
  const file = { byteLength: undefined, slice }
  // @ts-expect-error testing invalid input
  const pendingRead = parquetRead({ file })
  await expect(pendingRead).rejects.toThrow('parquetRead expected file AsyncBuffer')
})
|
|
|
|
|
|
2024-04-12 20:09:31 +00:00
|
|
|
it('filter by row', async () => {
  // The only assertion lives inside onComplete. Without this guard the test
  // would pass vacuously if parquetRead never invoked the callback.
  expect.hasAssertions()
  const file = await asyncBufferFromFile('test/files/rowgroups.parquet')
  await parquetRead({
    file,
    rowStart: 2,
    rowEnd: 4,
    onComplete: rows => {
      // Rows requested with rowStart 2 / rowEnd 4 are expected to hold 3 and 4
      expect(toJson(rows)).toEqual([[3], [4]])
    },
  })
})
|
|
|
|
|
|
|
|
|
|
it('filter by row overestimate', async () => {
  // The only assertion lives inside onComplete. Without this guard the test
  // would pass vacuously if parquetRead never invoked the callback.
  expect.hasAssertions()
  const file = await asyncBufferFromFile('test/files/rowgroups.parquet')
  await parquetRead({
    file,
    // rowEnd is larger than the 15 rows expected below
    rowEnd: 100,
    onComplete: rows => {
      expect(toJson(rows)).toEqual([[1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15]])
    },
  })
})
|
|
|
|
|
|
|
|
|
|
it('read a single column', async () => {
  const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
  // Every content assertion lives inside a callback. Record that each callback
  // actually ran so a skipped callback fails the test instead of passing silently.
  let sawChunk = false
  let sawComplete = false
  await parquetRead({
    file,
    columns: ['c'],
    onChunk: chunk => {
      sawChunk = true
      expect(toJson(chunk)).toEqual({
        columnName: 'c',
        columnData: [2, 3, 4, 5, 2],
        rowStart: 0,
        rowEnd: 5,
      })
    },
    onComplete: (rows) => {
      sawComplete = true
      expect(toJson(rows)).toEqual([
        [2],
        [3],
        [4],
        [5],
        [2],
      ])
    },
  })
  expect(sawChunk).toBe(true)
  expect(sawComplete).toBe(true)
})
|
2024-03-14 23:39:03 +00:00
|
|
|
|
2024-04-12 20:09:31 +00:00
|
|
|
it('read a list-like column', async () => {
  const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
  // Every content assertion lives inside a callback. Record that each callback
  // actually ran so a skipped callback fails the test instead of passing silently.
  let sawChunk = false
  let sawComplete = false
  await parquetRead({
    file,
    columns: ['e'],
    onChunk: chunk => {
      sawChunk = true
      expect(toJson(chunk)).toEqual({
        columnName: 'e',
        columnData: [[1, 2, 3], null, null, [1, 2, 3], [1, 2]],
        rowStart: 0,
        rowEnd: 5,
      })
    },
    onComplete: rows => {
      sawComplete = true
      expect(toJson(rows)).toEqual([
        [[1, 2, 3]],
        [null],
        [null],
        [[1, 2, 3]],
        [[1, 2]],
      ])
    },
  })
  expect(sawChunk).toBe(true)
  expect(sawComplete).toBe(true)
})
|
|
|
|
|
|
2024-04-12 20:09:31 +00:00
|
|
|
it('read a map-like column', async () => {
  const file = await asyncBufferFromFile('test/files/nullable.impala.parquet')
  // Every content assertion lives inside a callback. Record that each callback
  // actually ran so a skipped callback fails the test instead of passing silently.
  let sawChunk = false
  let sawComplete = false
  await parquetRead({
    file,
    columns: ['int_map'],
    onChunk: chunk => {
      sawChunk = true
      expect(toJson(chunk)).toEqual({
        columnName: 'int_map',
        columnData: [
          { k1: 1, k2: 100 },
          { k1: 2, k2: null },
          { },
          { },
          { },
          null,
          { k1: null, k3: null },
        ],
        rowStart: 0,
        rowEnd: 7,
      })
    },
    onComplete: rows => {
      sawComplete = true
      expect(toJson(rows)).toEqual([
        [{ k1: 1, k2: 100 }],
        [{ k1: 2, k2: null }],
        [{ }],
        [{ }],
        [{ }],
        [null],
        [{ k1: null, k3: null }],
      ])
    },
  })
  expect(sawChunk).toBe(true)
  expect(sawComplete).toBe(true)
})
|
2024-08-13 16:15:59 +00:00
|
|
|
|
|
|
|
|
it('format row as object', async () => {
  // The only assertion lives inside onComplete. Without this guard the test
  // would pass vacuously if parquetRead never invoked the callback.
  expect.hasAssertions()
  const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
  await parquetRead({
    file,
    columns: ['c'],
    rowFormat: 'object',
    onComplete: (rows) => {
      // With rowFormat 'object' each row is expected keyed by column name
      expect(toJson(rows)).toEqual([
        { c: 2 },
        { c: 3 },
        { c: 4 },
        { c: 5 },
        { c: 2 },
      ])
    },
  })
})
|
2024-08-14 07:01:47 +00:00
|
|
|
|
|
|
|
|
it('read columns out of order', async () => {
  // The only assertion lives inside onComplete. Without this guard the test
  // would pass vacuously if parquetRead never invoked the callback.
  expect.hasAssertions()
  const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
  await parquetRead({
    file,
    // Requested list repeats 'c' and includes the name 'missing'
    columns: ['c', 'missing', 'b', 'c'],
    onComplete: (rows) => {
      // Expected cells follow the requested order; the 'missing' slot is null
      expect(toJson(rows)).toEqual([
        [2, null, 1, 2],
        [3, null, 2, 3],
        [4, null, 3, 4],
        [5, null, 4, 5],
        [2, null, 5, 2],
      ])
    },
  })
})
|
2024-08-20 18:30:39 +00:00
|
|
|
|
|
|
|
|
it('read objects and return a promise', async () => {
  // Rows come back from the awaited parquetReadObjects call; no callbacks are used
  const expectedRows = [
    { a: 'abc', b: 1, c: 2, d: true, e: [1, 2, 3] },
    { a: 'abc', b: 2, c: 3, d: true },
    { a: 'abc', b: 3, c: 4, d: true },
    { a: null, b: 4, c: 5, d: false, e: [1, 2, 3] },
    { a: 'abc', b: 5, c: 2, d: true, e: [1, 2] },
  ]
  const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
  const actualRows = toJson(await parquetReadObjects({ file }))
  expect(actualRows).toEqual(expectedRows)
})
|
2024-01-15 19:08:48 +00:00
|
|
|
})
|