Test metadata of all sample files

This commit is contained in:
Kenny Daniel 2024-02-13 10:52:29 -08:00
parent 514014d08b
commit 2cf00da9aa
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
5 changed files with 167 additions and 163 deletions

@ -28,7 +28,7 @@
},
"devDependencies": {
"@types/node": "20.11.17",
"@typescript-eslint/eslint-plugin": "6.21.0",
"@typescript-eslint/eslint-plugin": "7.0.1",
"@vitest/coverage-v8": "1.2.2",
"eslint": "8.56.0",
"eslint-plugin-import": "2.29.1",

@ -0,0 +1,47 @@
{
"version": 1,
"created_by": "DuckDB",
"metadata_length": 149,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "duckdb_schema",
"num_children": 1
},
{
"type": 6,
"repetition_type": "OPTIONAL",
"name": "ADDRTYPE",
"converted_type": "UTF8"
}
],
"num_rows": 10,
"row_groups": [
{
"columns": [
{
"file_offset": 0,
"meta_data": {
"type": 6,
"encodings": [0, 8],
"path_in_schema": ["ADDRTYPE"],
"codec": "SNAPPY",
"num_values": 10,
"total_uncompressed_size": 78,
"total_compressed_size": 82,
"data_page_offset": 31,
"dictionary_page_offset": 4,
"statistics": {
"max": "Intersection",
"min": "Block",
"null_count": 1,
"distinct_count": 2
}
}
}
],
"total_byte_size": 33024,
"num_rows": 10
}
]
}

@ -0,0 +1,86 @@
{
"version": 2,
"created_by": "parquet-cpp-arrow version 14.0.2",
"metadata_length": 1602,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "schema",
"num_children": 1
},
{
"type": 2,
"repetition_type": "OPTIONAL",
"name": "numbers"
}
],
"num_rows": 15,
"row_groups": [
{
"columns": [
{
"file_offset": 150,
"meta_data": {
"codec": "SNAPPY",
"data_page_offset": 71,
"dictionary_page_offset": 4,
"encoding_stats": [
{ "count": 1, "encoding": 0, "page_type": 2 },
{ "count": 1, "encoding": 8, "page_type": 0 }
],
"encodings": [0, 3, 8],
"num_values": 10,
"path_in_schema": ["numbers"],
"statistics": {
"max": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"min": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"null_count": 0
},
"total_compressed_size": 146,
"total_uncompressed_size": 172,
"type": 2
}
}
],
"total_byte_size": 172,
"num_rows": 10
},
{
"columns": [
{
"file_offset": 368,
"meta_data": {
"codec": "SNAPPY",
"data_page_offset": 294,
"dictionary_page_offset": 248,
"encoding_stats": [
{ "count": 1, "encoding": 0, "page_type": 2 },
{ "count": 1, "encoding": 8, "page_type": 0 }
],
"encodings": [0, 3, 8],
"num_values": 5,
"path_in_schema": ["numbers"],
"statistics": {
"max": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"min": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
"null_count": 0
},
"total_compressed_size": 120,
"total_uncompressed_size": 126,
"type": 2
}
}
],
"total_byte_size": 126,
"num_rows": 5
}
],
"key_value_metadata": [
{
"key": "pandas"
},
{
"key": "ARROW:schema"
}
]
}

@ -24,3 +24,14 @@ export function fileToAsyncBuffer(filePath) {
slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end),
}
}
/**
 * Load a JSON file from disk and return its parsed contents.
 *
 * The file is read synchronously and decoded as UTF-8 before parsing.
 *
 * @param {string} filePath path of the JSON file to read
 * @returns {any} the value produced by JSON.parse
 */
export function fileToJson(filePath) {
  const text = fs.readFileSync(filePath, 'utf8')
  return JSON.parse(text)
}

@ -1,19 +1,20 @@
import fs from 'fs'
import { describe, expect, it } from 'vitest'
import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js'
import { toJson } from '../src/toJson.js'
import { fileToAsyncBuffer, readFileToArrayBuffer } from './helpers.js'
import { fileToAsyncBuffer, fileToJson, readFileToArrayBuffer } from './helpers.js'
describe('parquetMetadata', () => {
it('should parse metadata from addrtype-missing-value.parquet', async () => {
const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
const result = parquetMetadata(arrayBuffer)
expect(toJson(result)).toEqual(addrtypeMetadata)
})
it('should parse metadata from rowgroups.parquet', async () => {
const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
const result = parquetMetadata(arrayBuffer)
expect(toJson(result)).containSubset(rowgroupsMetadata)
it('should parse metadata from all test files', async () => {
const files = fs.readdirSync('test/files')
for (const file of files) {
if (!file.endsWith('.parquet')) continue
const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`)
const result = parquetMetadata(arrayBuffer)
const base = file.replace('.parquet', '')
const expected = fileToJson(`test/files/${base}.metadata.json`)
expect(toJson(result)).containSubset(expected)
}
})
it('should throw an error for a too short file', () => {
@ -45,17 +46,16 @@ describe('parquetMetadata', () => {
})
describe('parquetMetadataAsync', () => {
it('should parse metadata asynchronously from addrtype-missing-value.parquet', async () => {
const asyncBuffer = fileToAsyncBuffer('test/files/addrtype-missing-value.parquet')
const result = await parquetMetadataAsync(asyncBuffer)
expect(toJson(result)).toEqual(addrtypeMetadata)
})
it('should parse metadata asynchronously from rowgroups.parquet', async () => {
const asyncBuffer = fileToAsyncBuffer('test/files/rowgroups.parquet')
// force two fetches
const result = await parquetMetadataAsync(asyncBuffer, 1609)
expect(toJson(result)).containSubset(rowgroupsMetadata)
it('should parse metadata asynchronously from all test files', async () => {
const files = fs.readdirSync('test/files')
for (const file of files) {
if (!file.endsWith('.parquet')) continue
const asyncBuffer = fileToAsyncBuffer(`test/files/${file}`)
const result = await parquetMetadataAsync(asyncBuffer)
const base = file.replace('.parquet', '')
const expected = fileToJson(`test/files/${base}.metadata.json`)
expect(toJson(result)).containSubset(expected)
}
})
it('should throw an error for invalid magic number', () => {
@ -70,143 +70,3 @@ describe('parquetMetadataAsync', () => {
.toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
})
})
// Parquet v1 from DuckDB.
// Expected metadata for test/files/addrtype-missing-value.parquet; the
// parquetMetadata and parquetMetadataAsync tests compare the parsed result
// against this object with toEqual.
const addrtypeMetadata = {
version: 1,
created_by: 'DuckDB',
metadata_length: 149,
schema: [
{
repetition_type: 'REQUIRED',
name: 'duckdb_schema',
num_children: 1,
},
{
type: 6,
repetition_type: 'OPTIONAL',
name: 'ADDRTYPE',
converted_type: 'UTF8',
},
],
num_rows: 10,
row_groups: [
{
columns: [
{
file_offset: 0,
meta_data: {
type: 6,
encodings: [0, 8],
path_in_schema: ['ADDRTYPE'],
codec: 'SNAPPY',
num_values: 10,
total_uncompressed_size: 78,
total_compressed_size: 82,
data_page_offset: 31,
dictionary_page_offset: 4,
statistics: {
max: 'Intersection',
min: 'Block',
null_count: 1,
distinct_count: 2,
},
},
},
],
total_byte_size: 33024,
num_rows: 10,
},
],
}
// Parquet v2 from pandas with 2 row groups.
// Expected metadata for test/files/rowgroups.parquet; the parquetMetadata and
// parquetMetadataAsync tests match the parsed result against this object with
// containSubset. The key_value_metadata values are left out below.
const rowgroupsMetadata = {
version: 2,
created_by: 'parquet-cpp-arrow version 14.0.2',
metadata_length: 1602,
schema: [
{
repetition_type: 'REQUIRED',
name: 'schema',
num_children: 1,
},
{
type: 2,
repetition_type: 'OPTIONAL',
name: 'numbers',
},
],
num_rows: 15,
row_groups: [
{
columns: [
{
file_offset: 150,
file_path: undefined,
meta_data: {
codec: 'SNAPPY',
data_page_offset: 71,
dictionary_page_offset: 4,
encoding_stats: [
{ count: 1, encoding: 0, page_type: 2 },
{ count: 1, encoding: 8, page_type: 0 },
],
encodings: [0, 3, 8],
num_values: 10,
path_in_schema: ['numbers'],
statistics: {
max: '\n\x00\x00\x00\x00\x00\x00\x00',
min: '\x01\x00\x00\x00\x00\x00\x00\x00',
null_count: 0,
},
total_compressed_size: 146,
total_uncompressed_size: 172,
type: 2,
},
},
],
total_byte_size: 172,
num_rows: 10,
},
{
columns: [
{
file_offset: 368,
meta_data: {
codec: 'SNAPPY',
data_page_offset: 294,
dictionary_page_offset: 248,
encoding_stats: [
{ count: 1, encoding: 0, page_type: 2 },
{ count: 1, encoding: 8, page_type: 0 },
],
encodings: [0, 3, 8],
num_values: 5,
path_in_schema: ['numbers'],
statistics: {
max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
null_count: 0,
},
total_compressed_size: 120,
total_uncompressed_size: 126,
type: 2,
},
},
],
total_byte_size: 126,
num_rows: 5,
},
],
key_value_metadata: [
{
key: 'pandas',
// value: json (not listed in this expected object)
},
{
key: 'ARROW:schema',
// value: base64 (not listed in this expected object)
},
],
}