2024-12-20 02:08:22 +00:00
|
|
|
import { describe, expect, it } from 'vitest'
|
2025-04-30 07:49:40 +00:00
|
|
|
import { readColumn } from '../src/column.js'
|
2025-06-10 01:02:31 +00:00
|
|
|
import { DEFAULT_PARSERS } from '../src/convert.js'
|
2025-05-30 22:47:02 +00:00
|
|
|
import { parquetMetadata } from '../src/index.js'
|
2025-05-30 20:01:20 +00:00
|
|
|
import { asyncBufferFromFile } from '../src/node.js'
|
2025-04-30 07:49:40 +00:00
|
|
|
import { getColumnRange } from '../src/plan.js'
|
2024-12-20 02:08:22 +00:00
|
|
|
import { getSchemaPath } from '../src/schema.js'
|
|
|
|
|
|
2024-12-20 08:53:56 +00:00
|
|
|
// Values the float16 fixture test expects readColumn to produce for its
// first column (one entry per row, in file order).
const values = [
  null,
  1,
  -2,
  NaN,
  0,
  -1,
  -0,
  2,
]
|
2024-12-20 02:08:22 +00:00
|
|
|
|
2024-12-20 08:53:56 +00:00
|
|
|
describe('readColumn', () => {
|
|
|
|
|
// Reads the first column of the float16 fixture with several `selectEnd`
// bounds and checks the chunks returned by readColumn against `expected`.
// `expected` is a list of chunks (arrays of values), not a flat list of values.
it.for([
  { selectEnd: Infinity, expected: [values] },
  { selectEnd: 2, expected: [values.slice(0, 2)] },
  { selectEnd: 0, expected: [] },
  // `$selectEnd` is vitest's object-property interpolation for case objects.
])('readColumn with selectEnd $selectEnd', async ({ selectEnd, expected }) => {
  const testFile = 'test/files/float16_nonzeros_and_nans.parquet'
  const file = await asyncBufferFromFile(testFile)
  // Load the whole file into memory so metadata and column bytes share one buffer.
  const arrayBuffer = await file.slice(0)
  const metadata = parquetMetadata(arrayBuffer)

  const column = metadata.row_groups[0].columns[0]
  if (!column.meta_data) throw new Error(`No column metadata for ${testFile}`)

  // Cut out just the bytes that belong to this column chunk.
  const { startByte, endByte } = getColumnRange(column.meta_data)
  const columnArrayBuffer = arrayBuffer.slice(startByte, endByte)

  // meta_data is known to be present here, so no optional chaining is needed.
  const schemaPath = getSchemaPath(metadata.schema, column.meta_data.path_in_schema ?? [])
  const reader = { view: new DataView(columnArrayBuffer), offset: 0 }
  const columnDecoder = {
    columnName: column.meta_data.path_in_schema.join('.'),
    type: column.meta_data.type,
    element: schemaPath[schemaPath.length - 1].element,
    schemaPath,
    parsers: DEFAULT_PARSERS,
    codec: column.meta_data.codec,
  }
  const rowGroupSelect = {
    groupStart: 0,
    selectStart: 0,
    selectEnd,
    // NOTE(review): expected.length is the number of expected chunks (1 or 0),
    // not a row count — confirm this is the intended value for groupRows.
    groupRows: expected.length,
  }

  const result = readColumn(reader, rowGroupSelect, columnDecoder)
  expect(result).toEqual(expected)
})
|
2025-03-11 06:33:47 +00:00
|
|
|
|
|
|
|
|
// Decodes the second column of the datapage_v2 fixture and verifies that the
// first chunk returned by readColumn is an Int32Array.
it('readColumn should return a typed array', async () => {
  const testFile = 'test/files/datapage_v2.snappy.parquet'
  const asyncBuffer = await asyncBufferFromFile(testFile)
  const fileBytes = await asyncBuffer.slice(0)
  const metadata = parquetMetadata(fileBytes)

  // second column
  const secondColumn = metadata.row_groups[0].columns[1]
  const columnMeta = secondColumn.meta_data
  if (!columnMeta) throw new Error(`No column metadata for ${testFile}`)
  const { path_in_schema, type, codec, num_values } = columnMeta

  // Restrict the reader to the byte range occupied by this column chunk.
  const byteRange = getColumnRange(columnMeta)
  const columnBytes = fileBytes.slice(byteRange.startByte, byteRange.endByte)
  const schemaPath = getSchemaPath(metadata.schema, path_in_schema ?? [])

  const reader = {
    view: new DataView(columnBytes),
    offset: 0,
  }
  const columnDecoder = {
    columnName: path_in_schema.join('.'),
    type,
    // the last entry of the schema path supplies the column's schema element
    element: schemaPath[schemaPath.length - 1].element,
    schemaPath,
    parsers: DEFAULT_PARSERS,
    codec,
  }
  const rowGroupSelect = {
    groupStart: 0,
    selectStart: 0,
    selectEnd: Infinity,
    groupRows: Number(num_values),
  }

  const chunks = readColumn(reader, rowGroupSelect, columnDecoder)
  expect(chunks[0]).toBeInstanceOf(Int32Array)
})
|
2024-12-20 02:08:22 +00:00
|
|
|
})
|