2025-04-07 04:21:24 +00:00
|
|
|
import { describe, expect, it, vi } from 'vitest'
|
|
|
|
|
import { convertWithDictionary } from '../src/convert.js'
|
2025-05-30 22:47:02 +00:00
|
|
|
import { parquetMetadataAsync, parquetRead, parquetReadObjects } from '../src/index.js'
|
2025-05-30 20:01:20 +00:00
|
|
|
import { asyncBufferFromFile } from '../src/node.js'
|
2025-05-19 09:13:37 +00:00
|
|
|
import { countingBuffer } from './helpers.js'
|
2024-02-23 18:25:06 +00:00
|
|
|
|
2025-04-07 04:21:24 +00:00
|
|
|
vi.mock('../src/convert.js', { spy: true })
|
|
|
|
|
|
describe('parquetRead', () => {
  // Invalid input: file option missing entirely.
  it('throws error for undefined file', async () => {
    // @ts-expect-error testing invalid input
    await expect(parquetRead({ file: undefined }))
      .rejects.toThrow('parquet expected AsyncBuffer')
  })

  // Invalid input: object has slice() but no numeric byteLength,
  // so it does not satisfy the AsyncBuffer contract.
  it('throws error for undefined byteLength', async () => {
    const file = { byteLength: undefined, slice: () => new ArrayBuffer(0) }
    // @ts-expect-error testing invalid input
    await expect(parquetRead({ file }))
      .rejects.toThrow('parquet expected AsyncBuffer')
  })

  // rowStart/rowEnd select a half-open range of rows; values are BigInt.
  it('filter by row', async () => {
    const file = await asyncBufferFromFile('test/files/rowgroups.parquet')
    await parquetRead({
      file,
      rowStart: 2,
      rowEnd: 4,
      onComplete(rows) {
        expect(rows).toEqual([[3n], [4n]])
      },
    })
  })

  // rowEnd past the end of the file is clamped to the actual row count (15).
  it('filter by row overestimate', async () => {
    const file = await asyncBufferFromFile('test/files/rowgroups.parquet')
    await parquetRead({
      file,
      rowEnd: 100,
      onComplete(rows) {
        expect(rows).toEqual([
          [1n], [2n], [3n], [4n], [5n], [6n], [7n], [8n], [9n], [10n], [11n], [12n], [13n], [14n], [15n],
        ])
      },
    })
  })

  // onChunk delivers column 'b' as a typed array (Int32Array), not a plain array.
  it('read a single column as typed array', async () => {
    const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
    await parquetRead({
      file,
      columns: ['b'],
      onChunk(chunk) {
        expect(chunk).toEqual({
          columnName: 'b',
          columnData: new Int32Array([1, 2, 3, 4, 5]),
          rowStart: 0,
          rowEnd: 5,
        })
        expect(chunk.columnData).toBeInstanceOf(Int32Array)
      },
    })
  })

  // List-typed column 'e': nulls surface as undefined, both in the chunk
  // column data and in the assembled rows.
  it('read a list-like column', async () => {
    const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
    await parquetRead({
      file,
      columns: ['e'],
      onChunk(chunk) {
        expect(chunk).toEqual({
          columnName: 'e',
          columnData: [[1, 2, 3], undefined, undefined, [1, 2, 3], [1, 2]],
          rowStart: 0,
          rowEnd: 5,
        })
      },
      onComplete(rows) {
        expect(rows).toEqual([
          [[1, 2, 3]],
          [undefined],
          [undefined],
          [[1, 2, 3]],
          [[1, 2]],
        ])
      },
    })
  })

  // Map-typed column: entries become plain objects; empty maps become { },
  // a null map becomes undefined, and null values are preserved as null.
  it('read a map-like column', async () => {
    const file = await asyncBufferFromFile('test/files/nullable.impala.parquet')
    await parquetRead({
      file,
      columns: ['int_map'],
      onChunk(chunk) {
        expect(chunk).toEqual({
          columnName: 'int_map',
          columnData: [
            { k1: 1, k2: 100 },
            { k1: 2, k2: null },
            { },
            { },
            { },
            undefined,
            { k1: null, k3: null },
          ],
          rowStart: 0,
          rowEnd: 7,
        })
      },
      onComplete(rows) {
        expect(rows).toEqual([
          [{ k1: 1, k2: 100 }],
          [{ k1: 2, k2: null }],
          [{ }],
          [{ }],
          [{ }],
          [undefined],
          [{ k1: null, k3: null }],
        ])
      },
    })
  })

  // rowFormat: 'object' returns rows keyed by column name instead of arrays.
  it('format row as object', async () => {
    const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
    await parquetRead({
      file,
      columns: ['c'],
      rowFormat: 'object',
      onComplete(rows) {
        expect(rows).toEqual([
          { c: 2 },
          { c: 3 },
          { c: 4 },
          { c: 5 },
          { c: 2 },
        ])
      },
    })
  })

  // Column order in the output follows the requested order, duplicates are
  // allowed, and unknown column names yield undefined values.
  it('read columns out of order', async () => {
    const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
    await parquetRead({
      file,
      columns: ['c', 'missing', 'b', 'c'],
      onComplete(rows) {
        expect(rows).toEqual([
          [2, undefined, 1, 2],
          [3, undefined, 2, 3],
          [4, undefined, 3, 4],
          [5, undefined, 4, 5],
          [2, undefined, 5, 2],
        ])
      },
    })
  })

  // parquetReadObjects is the promise-returning convenience wrapper:
  // resolves to object-format rows without callbacks.
  it('read objects and return a promise', async () => {
    const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
    const rows = await parquetReadObjects({ file })
    expect(rows).toEqual([
      { a: 'abc', b: 1, c: 2, d: true, e: [1, 2, 3] },
      { a: 'abc', b: 2, c: 3, d: true },
      { a: 'abc', b: 3, c: 4, d: true },
      { a: null, b: 4, c: 5, d: false, e: [1, 2, 3] },
      { a: 'abc', b: 5, c: 2, d: true, e: [1, 2] },
    ])
  })

  // With a page index, reading a 1-row slice should only decode the pages
  // that overlap the slice: convertWithDictionary is called 4 times
  // (counted via the module-level vi.mock spy, cleared first).
  it('skips converting unnecessary pages', async () => {
    const file = await asyncBufferFromFile('test/files/page_indexed.parquet')
    const metadata = await parquetMetadataAsync(file)
    vi.mocked(convertWithDictionary).mockClear()
    const rows = await parquetReadObjects({
      file,
      metadata,
      rowStart: 90,
      rowEnd: 91,
    })
    expect(rows).toEqual([{ row: 90n, quality: 'bad' }])
    expect(convertWithDictionary).toHaveBeenCalledTimes(4)
  })

  // onPage emits one ColumnData per page; also checks I/O efficiency via
  // countingBuffer (fetch count and total bytes read).
  it('reads individual pages', async () => {
    const file = countingBuffer(await asyncBufferFromFile('test/files/page_indexed.parquet'))
    /** @type {import('../src/types.js').ColumnData[]} */
    const pages = []

    // check onPage callback
    await parquetRead({
      file,
      onPage(page) {
        pages.push(page)
      },
    })

    const expectedPages = [
      {
        columnName: 'row',
        columnData: Array.from({ length: 100 }, (_, i) => BigInt(i)),
        rowStart: 0,
        rowEnd: 100,
      },
      {
        columnName: 'quality',
        columnData: [
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'bad',
          'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good',
          'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'bad',
          'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'good', 'bad',
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad',
          'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'bad',
          'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad',
        ],
        rowStart: 0,
        rowEnd: 100,
      },
      {
        columnName: 'row',
        columnData: Array.from({ length: 100 }, (_, i) => BigInt(i + 100)),
        rowStart: 100,
        rowEnd: 200,
      },
      {
        columnName: 'quality',
        columnData: [
          'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good',
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad',
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
          'bad', 'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'good', 'bad',
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
          'bad', 'bad', 'bad', 'bad', 'good', 'bad', 'bad', 'bad', 'good', 'bad',
          'bad', 'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
          'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
          'good', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad', 'bad',
        ],
        rowStart: 100,
        rowEnd: 200,
      },
    ]

    // expect each page to exist in expected
    for (const expected of expectedPages) {
      const page = pages.find(p => p.columnName === expected.columnName && p.rowStart === expected.rowStart)
      expect(page).toEqual(expected)
    }
    expect(file.fetches).toBe(3) // 1 metadata, 2 rowgroups
    expect(file.bytes).toBe(6421)
  })
})