diff --git a/test/helpers.js b/test/helpers.js new file mode 100644 index 0000000..ccad905 --- /dev/null +++ b/test/helpers.js @@ -0,0 +1,26 @@ +import fs from 'fs' + +/** + * Helper function to read .parquet file into ArrayBuffer + * + * @param {string} filePath + * @returns {Promise} + */ +export async function readFileToArrayBuffer(filePath) { + const buffer = await fs.promises.readFile(filePath) + return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) +} + +/** + * Wrap .parquet file in an AsyncBuffer + * + * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer + * @param {string} filePath + * @returns {AsyncBuffer} + */ +export function fileToAsyncBuffer(filePath) { + return { + byteLength: fs.statSync(filePath).size, + slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end), + } +} diff --git a/test/metadata.test.js b/test/metadata.test.js index 427961d..9aa438d 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -1,32 +1,7 @@ -import fs from 'fs' import { describe, expect, it } from 'vitest' -import { parquetMetadata, parquetMetadataAsync, parquetSchema } from '../src/hyparquet.js' +import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js' import { toJson } from '../src/toJson.js' - -/** - * Helper function to read .parquet file into ArrayBuffer - * - * @param {string} filePath - * @returns {Promise} - */ -async function readFileToArrayBuffer(filePath) { - const buffer = await fs.promises.readFile(filePath) - return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) -} - -/** - * Wrap .parquet file in an AsyncBuffer - * - * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer - * @param {string} filePath - * @returns {AsyncBuffer} - */ -function fileToAsyncBuffer(filePath) { - return { - byteLength: fs.statSync(filePath).size, - slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end), - } -} +import { fileToAsyncBuffer, readFileToArrayBuffer } from './helpers.js' describe('parquetMetadata', () => { it('should parse metadata from addrtype-missing-value.parquet', async () => { @@ -77,59 +52,6 @@ describe('parquetMetadataAsync', () => { }) }) -describe('parquetSchema', () => { - it('should parse schema from addrtype-missing-value.parquet', async () => { - const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet') - const metadata = parquetMetadata(arrayBuffer) - const result = parquetSchema(metadata) - expect(toJson(result)).toEqual({ - children: [ - { - children: [], - count: 1, - element: { - converted_type: 0, - name: 'ADDRTYPE', - repetition_type: 1, - type: 6, - }, - }, - ], - count: 2, - element: { - name: 'duckdb_schema', - num_children: 1, - repetition_type: 0, - }, - }) - }) - - it('should parse schema from rowgroups.parquet', async () => { - const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet') - const metadata = parquetMetadata(arrayBuffer) - const result = parquetSchema(metadata) - expect(toJson(result)).toEqual({ - children: [ - { - children: [], - count: 1, - element: { - name: 'numbers', - repetition_type: 1, - type: 2, - }, - }, - ], - count: 2, - element: { - name: 'schema', - num_children: 1, - repetition_type: 0, - }, - }) - }) -}) - // Parquet v1 from DuckDB const addrtypeMetadata = { version: 1, diff --git a/test/read.test.js b/test/read.test.js index 7fb63a6..9159b9d 100644 --- a/test/read.test.js +++ b/test/read.test.js @@ -1,32 +1,7 @@ -import fs from 'fs' import { describe, expect, it } from 'vitest' import { parquetRead } from '../src/hyparquet.js' import { toJson } from '../src/toJson.js' - -/** - * Helper function to read .parquet file into ArrayBuffer - * - * @param {string} filePath - * @returns {Promise} - */ -async function readFileToArrayBuffer(filePath) { - const buffer = await fs.promises.readFile(filePath) - return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) -} - -/** - * Wrap .parquet file in an AsyncBuffer - * - * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer - * @param {string} filePath - * @returns {AsyncBuffer} - */ -function fileToAsyncBuffer(filePath) { - return { - byteLength: fs.statSync(filePath).size, - slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end), - } -} +import { fileToAsyncBuffer } from './helpers.js' describe('parquetMetadataAsync', () => { it('should parse data from addrtype-missing-value.parquet', async () => { @@ -34,7 +9,7 @@ describe('parquetMetadataAsync', () => { await parquetRead({ file: asyncBuffer, onComplete: (rows) => { - expect(toJson(rows)).toEqual(addrtypeData) + expect(rows).toEqual(addrtypeData) }, }) }) diff --git a/test/schemaTree.test.js b/test/schemaTree.test.js new file mode 100644 index 0000000..ce8a338 --- /dev/null +++ b/test/schemaTree.test.js @@ -0,0 +1,62 @@ +import { describe, expect, it } from 'vitest' +import { parquetMetadata, parquetSchema } from '../src/hyparquet.js' +import { readFileToArrayBuffer } from './helpers.js' + +describe('schemaTree', () => { + it('should parse schema tree from addrtype-missing-value.parquet', async () => { + const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet') + const metadata = parquetMetadata(arrayBuffer) + const result = parquetSchema(metadata) + expect(result).toEqual(addrtypeSchema) + }) + + it('should parse schema tree from rowgroups.parquet', async () => { + const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet') + const metadata = parquetMetadata(arrayBuffer) + const result = parquetSchema(metadata) + expect(result).toEqual(rowgroupsSchema) + }) +}) + +// Parquet v1 from DuckDB +const addrtypeSchema = { + children: [ + { + children: [], + count: 1, + element: { + converted_type: 0, + name: 'ADDRTYPE', + repetition_type: 1, + type: 6, + }, + }, + ], + count: 2, + element: { + name: 'duckdb_schema', + num_children: 1, + repetition_type: 0, + }, +} + +// Parquet v2 from pandas with 2 row groups +const rowgroupsSchema = { + children: [ + { + children: [], + count: 1, + element: { + name: 'numbers', + repetition_type: 1, + type: 2, + }, + }, + ], + count: 2, + element: { + name: 'schema', + num_children: 1, + repetition_type: 0, + }, +}