Split out schemaTree tests

This commit is contained in:
Kenny Daniel 2024-01-20 12:17:11 -08:00
parent a40e678214
commit 8484426bc8
No known key found for this signature in database
GPG Key ID: 6A3C5E318BE71391
4 changed files with 92 additions and 107 deletions

26
test/helpers.js Normal file

@ -0,0 +1,26 @@
import fs from 'fs'
/**
 * Load a .parquet file from disk and hand back its bytes as an ArrayBuffer.
 *
 * @param {string} filePath path of the file to read
 * @returns {Promise<ArrayBuffer>} resolves to a copy of exactly the file's bytes
 */
export async function readFileToArrayBuffer(filePath) {
  const { buffer: backingStore, byteOffset, byteLength } = await fs.promises.readFile(filePath)
  // Copy out only the window the Buffer occupies: the ArrayBuffer behind a
  // Node Buffer is not guaranteed to start or end where the Buffer does.
  const windowEnd = byteOffset + byteLength
  return backingStore.slice(byteOffset, windowEnd)
}
/**
 * Wrap .parquet file in an AsyncBuffer
 *
 * byteLength is taken from a synchronous stat at creation time. Each slice()
 * call opens the file and reads only the requested byte range, instead of
 * loading the entire file just to return a small piece of it.
 *
 * start and end are interpreted like ArrayBuffer.prototype.slice arguments:
 * negative values count back from the end of the file, out-of-range values are
 * clamped, and an omitted end means "through the end of the file".
 *
 * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer
 * @param {string} filePath
 * @returns {AsyncBuffer}
 */
export function fileToAsyncBuffer(filePath) {
  const fileSize = fs.statSync(filePath).size

  /**
   * Turn a slice argument into an absolute offset within [0, fileSize].
   *
   * @param {number | undefined} index raw slice argument
   * @param {number} fallback offset to use when the argument is omitted
   * @returns {number}
   */
  const resolveIndex = (index, fallback) => {
    if (index === undefined) return fallback
    // NaN and -0 both collapse to 0; +/-Infinity are clamped below
    const whole = Math.trunc(Number(index)) || 0
    return whole < 0 ? Math.max(fileSize + whole, 0) : Math.min(whole, fileSize)
  }

  return {
    byteLength: fileSize,
    slice: async (start, end) => {
      const from = resolveIndex(start, 0)
      const to = resolveIndex(end, fileSize)
      const bytes = new Uint8Array(Math.max(to - from, 0))
      // Nothing to read for an empty or inverted range
      if (bytes.byteLength === 0) return bytes.buffer

      const handle = await fs.promises.open(filePath, 'r')
      try {
        // A single read() may return fewer bytes than requested, so keep going
        // until the range is filled or the file runs out.
        let filled = 0
        while (filled < bytes.byteLength) {
          const { bytesRead } = await handle.read(bytes, filled, bytes.byteLength - filled, from + filled)
          if (bytesRead === 0) break // file is shorter now than when it was stat'ed
          filled += bytesRead
        }
        return filled === bytes.byteLength ? bytes.buffer : bytes.buffer.slice(0, filled)
      } finally {
        // Always release the descriptor, even when a read fails
        await handle.close()
      }
    },
  }
}

@ -1,32 +1,7 @@
import fs from 'fs'
import { describe, expect, it } from 'vitest'
import { parquetMetadata, parquetMetadataAsync, parquetSchema } from '../src/hyparquet.js'
import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js'
import { toJson } from '../src/toJson.js'
/**
 * Load a .parquet file from disk and hand back its bytes as an ArrayBuffer.
 *
 * @param {string} filePath path of the file to read
 * @returns {Promise<ArrayBuffer>} resolves to a copy of exactly the file's bytes
 */
async function readFileToArrayBuffer(filePath) {
  const { buffer: backingStore, byteOffset, byteLength } = await fs.promises.readFile(filePath)
  // Copy out only the window the Buffer occupies: the ArrayBuffer behind a
  // Node Buffer is not guaranteed to start or end where the Buffer does.
  const windowEnd = byteOffset + byteLength
  return backingStore.slice(byteOffset, windowEnd)
}
/**
 * Wrap .parquet file in an AsyncBuffer
 *
 * byteLength is taken from a synchronous stat at creation time. Each slice()
 * call opens the file and reads only the requested byte range, instead of
 * loading the entire file just to return a small piece of it.
 *
 * start and end are interpreted like ArrayBuffer.prototype.slice arguments:
 * negative values count back from the end of the file, out-of-range values are
 * clamped, and an omitted end means "through the end of the file".
 *
 * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer
 * @param {string} filePath
 * @returns {AsyncBuffer}
 */
function fileToAsyncBuffer(filePath) {
  const fileSize = fs.statSync(filePath).size

  /**
   * Turn a slice argument into an absolute offset within [0, fileSize].
   *
   * @param {number | undefined} index raw slice argument
   * @param {number} fallback offset to use when the argument is omitted
   * @returns {number}
   */
  const resolveIndex = (index, fallback) => {
    if (index === undefined) return fallback
    // NaN and -0 both collapse to 0; +/-Infinity are clamped below
    const whole = Math.trunc(Number(index)) || 0
    return whole < 0 ? Math.max(fileSize + whole, 0) : Math.min(whole, fileSize)
  }

  return {
    byteLength: fileSize,
    slice: async (start, end) => {
      const from = resolveIndex(start, 0)
      const to = resolveIndex(end, fileSize)
      const bytes = new Uint8Array(Math.max(to - from, 0))
      // Nothing to read for an empty or inverted range
      if (bytes.byteLength === 0) return bytes.buffer

      const handle = await fs.promises.open(filePath, 'r')
      try {
        // A single read() may return fewer bytes than requested, so keep going
        // until the range is filled or the file runs out.
        let filled = 0
        while (filled < bytes.byteLength) {
          const { bytesRead } = await handle.read(bytes, filled, bytes.byteLength - filled, from + filled)
          if (bytesRead === 0) break // file is shorter now than when it was stat'ed
          filled += bytesRead
        }
        return filled === bytes.byteLength ? bytes.buffer : bytes.buffer.slice(0, filled)
      } finally {
        // Always release the descriptor, even when a read fails
        await handle.close()
      }
    },
  }
}
import { fileToAsyncBuffer, readFileToArrayBuffer } from './helpers.js'
describe('parquetMetadata', () => {
it('should parse metadata from addrtype-missing-value.parquet', async () => {
@ -77,59 +52,6 @@ describe('parquetMetadataAsync', () => {
})
})
describe('parquetSchema', () => {
  /**
   * Read a fixture file, parse its metadata, build the schema from it and
   * pass the result through toJson so it can be compared with a literal.
   *
   * @param {string} filePath
   * @returns {Promise<any>}
   */
  async function schemaJsonOf(filePath) {
    const fileBytes = await readFileToArrayBuffer(filePath)
    return toJson(parquetSchema(parquetMetadata(fileBytes)))
  }

  it('should parse schema from addrtype-missing-value.parquet', async () => {
    const expected = {
      children: [
        {
          children: [],
          count: 1,
          element: {
            converted_type: 0,
            name: 'ADDRTYPE',
            repetition_type: 1,
            type: 6,
          },
        },
      ],
      count: 2,
      element: {
        name: 'duckdb_schema',
        num_children: 1,
        repetition_type: 0,
      },
    }
    const actual = await schemaJsonOf('test/files/addrtype-missing-value.parquet')
    expect(actual).toEqual(expected)
  })

  it('should parse schema from rowgroups.parquet', async () => {
    const expected = {
      children: [
        {
          children: [],
          count: 1,
          element: {
            name: 'numbers',
            repetition_type: 1,
            type: 2,
          },
        },
      ],
      count: 2,
      element: {
        name: 'schema',
        num_children: 1,
        repetition_type: 0,
      },
    }
    const actual = await schemaJsonOf('test/files/rowgroups.parquet')
    expect(actual).toEqual(expected)
  })
})
// Parquet v1 from DuckDB
const addrtypeMetadata = {
version: 1,

@ -1,32 +1,7 @@
import fs from 'fs'
import { describe, expect, it } from 'vitest'
import { parquetRead } from '../src/hyparquet.js'
import { toJson } from '../src/toJson.js'
/**
 * Load a .parquet file from disk and hand back its bytes as an ArrayBuffer.
 *
 * @param {string} filePath path of the file to read
 * @returns {Promise<ArrayBuffer>} resolves to a copy of exactly the file's bytes
 */
async function readFileToArrayBuffer(filePath) {
  const { buffer: backingStore, byteOffset, byteLength } = await fs.promises.readFile(filePath)
  // Copy out only the window the Buffer occupies: the ArrayBuffer behind a
  // Node Buffer is not guaranteed to start or end where the Buffer does.
  const windowEnd = byteOffset + byteLength
  return backingStore.slice(byteOffset, windowEnd)
}
/**
 * Wrap .parquet file in an AsyncBuffer
 *
 * byteLength is taken from a synchronous stat at creation time. Each slice()
 * call opens the file and reads only the requested byte range, instead of
 * loading the entire file just to return a small piece of it.
 *
 * start and end are interpreted like ArrayBuffer.prototype.slice arguments:
 * negative values count back from the end of the file, out-of-range values are
 * clamped, and an omitted end means "through the end of the file".
 *
 * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer
 * @param {string} filePath
 * @returns {AsyncBuffer}
 */
function fileToAsyncBuffer(filePath) {
  const fileSize = fs.statSync(filePath).size

  /**
   * Turn a slice argument into an absolute offset within [0, fileSize].
   *
   * @param {number | undefined} index raw slice argument
   * @param {number} fallback offset to use when the argument is omitted
   * @returns {number}
   */
  const resolveIndex = (index, fallback) => {
    if (index === undefined) return fallback
    // NaN and -0 both collapse to 0; +/-Infinity are clamped below
    const whole = Math.trunc(Number(index)) || 0
    return whole < 0 ? Math.max(fileSize + whole, 0) : Math.min(whole, fileSize)
  }

  return {
    byteLength: fileSize,
    slice: async (start, end) => {
      const from = resolveIndex(start, 0)
      const to = resolveIndex(end, fileSize)
      const bytes = new Uint8Array(Math.max(to - from, 0))
      // Nothing to read for an empty or inverted range
      if (bytes.byteLength === 0) return bytes.buffer

      const handle = await fs.promises.open(filePath, 'r')
      try {
        // A single read() may return fewer bytes than requested, so keep going
        // until the range is filled or the file runs out.
        let filled = 0
        while (filled < bytes.byteLength) {
          const { bytesRead } = await handle.read(bytes, filled, bytes.byteLength - filled, from + filled)
          if (bytesRead === 0) break // file is shorter now than when it was stat'ed
          filled += bytesRead
        }
        return filled === bytes.byteLength ? bytes.buffer : bytes.buffer.slice(0, filled)
      } finally {
        // Always release the descriptor, even when a read fails
        await handle.close()
      }
    },
  }
}
import { fileToAsyncBuffer } from './helpers.js'
describe('parquetMetadataAsync', () => {
it('should parse data from addrtype-missing-value.parquet', async () => {
@ -34,7 +9,7 @@ describe('parquetMetadataAsync', () => {
await parquetRead({
file: asyncBuffer,
onComplete: (rows) => {
expect(toJson(rows)).toEqual(addrtypeData)
expect(rows).toEqual(addrtypeData)
},
})
})

62
test/schemaTree.test.js Normal file

@ -0,0 +1,62 @@
import { describe, expect, it } from 'vitest'
import { parquetMetadata, parquetSchema } from '../src/hyparquet.js'
import { readFileToArrayBuffer } from './helpers.js'
describe('schemaTree', () => {
  /**
   * Read a fixture file, parse its metadata and build the schema tree from it.
   *
   * @param {string} filePath
   * @returns {Promise<any>}
   */
  const loadSchemaTree = async (filePath) => {
    const fileBytes = await readFileToArrayBuffer(filePath)
    return parquetSchema(parquetMetadata(fileBytes))
  }

  // The expected trees are `const`s declared further down in this file, so
  // they are only referenced inside the test bodies, which run after the
  // module has finished evaluating.
  it('should parse schema tree from addrtype-missing-value.parquet', async () => {
    const schemaTree = await loadSchemaTree('test/files/addrtype-missing-value.parquet')
    expect(schemaTree).toEqual(addrtypeSchema)
  })

  it('should parse schema tree from rowgroups.parquet', async () => {
    const schemaTree = await loadSchemaTree('test/files/rowgroups.parquet')
    expect(schemaTree).toEqual(rowgroupsSchema)
  })
})
// Parquet v1 from DuckDB
// Expected parquetSchema() result for test/files/addrtype-missing-value.parquet:
// a root node named 'duckdb_schema' holding a single child node named 'ADDRTYPE'.
// NOTE(review): type / converted_type / repetition_type look like raw Parquet
// enum codes — confirm their meaning against the Parquet format definition.
const addrtypeSchema = {
children: [
{
// the only column node; it has no nested children of its own
children: [],
// NOTE(review): presumably the number of schema elements in this subtree
// (this node plus descendants) — verify against parquetSchema
count: 1,
element: {
converted_type: 0,
name: 'ADDRTYPE',
repetition_type: 1,
type: 6,
},
},
],
count: 2,
// root element: declares one child and carries no type of its own
element: {
name: 'duckdb_schema',
num_children: 1,
repetition_type: 0,
},
}
// Parquet v2 from pandas with 2 row groups
// Expected parquetSchema() result for test/files/rowgroups.parquet:
// a root node named 'schema' holding a single child node named 'numbers'.
// NOTE(review): type / repetition_type look like raw Parquet enum codes —
// confirm their meaning against the Parquet format definition.
const rowgroupsSchema = {
children: [
{
// the only column node; it has no nested children of its own
children: [],
// NOTE(review): presumably the number of schema elements in this subtree
// (this node plus descendants) — verify against parquetSchema
count: 1,
element: {
name: 'numbers',
repetition_type: 1,
type: 2,
},
},
],
count: 2,
// root element: declares one child and carries no type of its own
element: {
name: 'schema',
num_children: 1,
repetition_type: 0,
},
}