mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-26 23:26:38 +00:00
Test metadata of all sample files
This commit is contained in:
parent
514014d08b
commit
2cf00da9aa
@ -28,7 +28,7 @@
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "20.11.17",
|
||||
"@typescript-eslint/eslint-plugin": "6.21.0",
|
||||
"@typescript-eslint/eslint-plugin": "7.0.1",
|
||||
"@vitest/coverage-v8": "1.2.2",
|
||||
"eslint": "8.56.0",
|
||||
"eslint-plugin-import": "2.29.1",
|
||||
|
||||
47
test/files/addrtype-missing-value.metadata.json
Normal file
47
test/files/addrtype-missing-value.metadata.json
Normal file
@ -0,0 +1,47 @@
|
||||
{
|
||||
"version": 1,
|
||||
"created_by": "DuckDB",
|
||||
"metadata_length": 149,
|
||||
"schema": [
|
||||
{
|
||||
"repetition_type": "REQUIRED",
|
||||
"name": "duckdb_schema",
|
||||
"num_children": 1
|
||||
},
|
||||
{
|
||||
"type": 6,
|
||||
"repetition_type": "OPTIONAL",
|
||||
"name": "ADDRTYPE",
|
||||
"converted_type": "UTF8"
|
||||
}
|
||||
],
|
||||
"num_rows": 10,
|
||||
"row_groups": [
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 0,
|
||||
"meta_data": {
|
||||
"type": 6,
|
||||
"encodings": [0, 8],
|
||||
"path_in_schema": ["ADDRTYPE"],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 10,
|
||||
"total_uncompressed_size": 78,
|
||||
"total_compressed_size": 82,
|
||||
"data_page_offset": 31,
|
||||
"dictionary_page_offset": 4,
|
||||
"statistics": {
|
||||
"max": "Intersection",
|
||||
"min": "Block",
|
||||
"null_count": 1,
|
||||
"distinct_count": 2
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 33024,
|
||||
"num_rows": 10
|
||||
}
|
||||
]
|
||||
}
|
||||
86
test/files/rowgroups.metadata.json
Normal file
86
test/files/rowgroups.metadata.json
Normal file
@ -0,0 +1,86 @@
|
||||
{
|
||||
"version": 2,
|
||||
"created_by": "parquet-cpp-arrow version 14.0.2",
|
||||
"metadata_length": 1602,
|
||||
"schema": [
|
||||
{
|
||||
"repetition_type": "REQUIRED",
|
||||
"name": "schema",
|
||||
"num_children": 1
|
||||
},
|
||||
{
|
||||
"type": 2,
|
||||
"repetition_type": "OPTIONAL",
|
||||
"name": "numbers"
|
||||
}
|
||||
],
|
||||
"num_rows": 15,
|
||||
"row_groups": [
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 150,
|
||||
"meta_data": {
|
||||
"codec": "SNAPPY",
|
||||
"data_page_offset": 71,
|
||||
"dictionary_page_offset": 4,
|
||||
"encoding_stats": [
|
||||
{ "count": 1, "encoding": 0, "page_type": 2 },
|
||||
{ "count": 1, "encoding": 8, "page_type": 0 }
|
||||
],
|
||||
"encodings": [0, 3, 8],
|
||||
"num_values": 10,
|
||||
"path_in_schema": ["numbers"],
|
||||
"statistics": {
|
||||
"max": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"min": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 146,
|
||||
"total_uncompressed_size": 172,
|
||||
"type": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 172,
|
||||
"num_rows": 10
|
||||
},
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 368,
|
||||
"meta_data": {
|
||||
"codec": "SNAPPY",
|
||||
"data_page_offset": 294,
|
||||
"dictionary_page_offset": 248,
|
||||
"encoding_stats": [
|
||||
{ "count": 1, "encoding": 0, "page_type": 2 },
|
||||
{ "count": 1, "encoding": 8, "page_type": 0 }
|
||||
],
|
||||
"encodings": [0, 3, 8],
|
||||
"num_values": 5,
|
||||
"path_in_schema": ["numbers"],
|
||||
"statistics": {
|
||||
"max": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"min": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
|
||||
"null_count": 0
|
||||
},
|
||||
"total_compressed_size": 120,
|
||||
"total_uncompressed_size": 126,
|
||||
"type": 2
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 126,
|
||||
"num_rows": 5
|
||||
}
|
||||
],
|
||||
"key_value_metadata": [
|
||||
{
|
||||
"key": "pandas"
|
||||
},
|
||||
{
|
||||
"key": "ARROW:schema"
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -24,3 +24,14 @@ export function fileToAsyncBuffer(filePath) {
|
||||
slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end),
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Read a JSON file from disk and parse its contents.
 *
 * Despite living next to the parquet helpers, this does not read parquet
 * data: it loads a plain JSON file (e.g. a `*.metadata.json` fixture) and
 * returns the parsed value.
 *
 * @param {string} filePath path of the JSON file to read
 * @returns {any} the value produced by JSON.parse
 * @throws {Error} if the file cannot be read
 * @throws {SyntaxError} if the file contents are not valid JSON
 */
export function fileToJson(filePath) {
  // Decode as UTF-8 text up front; JSON.parse expects a string
  const text = fs.readFileSync(filePath, 'utf8')
  return JSON.parse(text)
}
|
||||
|
||||
@ -1,19 +1,20 @@
|
||||
import fs from 'fs'
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js'
|
||||
import { toJson } from '../src/toJson.js'
|
||||
import { fileToAsyncBuffer, readFileToArrayBuffer } from './helpers.js'
|
||||
import { fileToAsyncBuffer, fileToJson, readFileToArrayBuffer } from './helpers.js'
|
||||
|
||||
describe('parquetMetadata', () => {
|
||||
it('should parse metadata from addrtype-missing-value.parquet', async () => {
|
||||
const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
|
||||
const result = parquetMetadata(arrayBuffer)
|
||||
expect(toJson(result)).toEqual(addrtypeMetadata)
|
||||
})
|
||||
|
||||
it('should parse metadata from rowgroups.parquet', async () => {
|
||||
const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
|
||||
const result = parquetMetadata(arrayBuffer)
|
||||
expect(toJson(result)).containSubset(rowgroupsMetadata)
|
||||
it('should parse metadata from all test files', async () => {
|
||||
const files = fs.readdirSync('test/files')
|
||||
for (const file of files) {
|
||||
if (!file.endsWith('.parquet')) continue
|
||||
const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`)
|
||||
const result = parquetMetadata(arrayBuffer)
|
||||
const base = file.replace('.parquet', '')
|
||||
const expected = fileToJson(`test/files/${base}.metadata.json`)
|
||||
expect(toJson(result)).containSubset(expected)
|
||||
}
|
||||
})
|
||||
|
||||
it('should throw an error for a too short file', () => {
|
||||
@ -45,17 +46,16 @@ describe('parquetMetadata', () => {
|
||||
})
|
||||
|
||||
describe('parquetMetadataAsync', () => {
|
||||
it('should parse metadata asynchronously from addrtype-missing-value.parquet', async () => {
|
||||
const asyncBuffer = fileToAsyncBuffer('test/files/addrtype-missing-value.parquet')
|
||||
const result = await parquetMetadataAsync(asyncBuffer)
|
||||
expect(toJson(result)).toEqual(addrtypeMetadata)
|
||||
})
|
||||
|
||||
it('should parse metadata asynchronously from rowgroups.parquet', async () => {
|
||||
const asyncBuffer = fileToAsyncBuffer('test/files/rowgroups.parquet')
|
||||
// force two fetches
|
||||
const result = await parquetMetadataAsync(asyncBuffer, 1609)
|
||||
expect(toJson(result)).containSubset(rowgroupsMetadata)
|
||||
it('should parse metadata asynchronously from all test files', async () => {
|
||||
const files = fs.readdirSync('test/files')
|
||||
for (const file of files) {
|
||||
if (!file.endsWith('.parquet')) continue
|
||||
const asyncBuffer = fileToAsyncBuffer(`test/files/${file}`)
|
||||
const result = await parquetMetadataAsync(asyncBuffer)
|
||||
const base = file.replace('.parquet', '')
|
||||
const expected = fileToJson(`test/files/${base}.metadata.json`)
|
||||
expect(toJson(result)).containSubset(expected)
|
||||
}
|
||||
})
|
||||
|
||||
it('should throw an error for invalid magic number', () => {
|
||||
@ -70,143 +70,3 @@ describe('parquetMetadataAsync', () => {
|
||||
.toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
|
||||
})
|
||||
})
|
||||
|
||||
// Parquet v1 from DuckDB
|
||||
const addrtypeMetadata = {
|
||||
version: 1,
|
||||
created_by: 'DuckDB',
|
||||
metadata_length: 149,
|
||||
schema: [
|
||||
{
|
||||
repetition_type: 'REQUIRED',
|
||||
name: 'duckdb_schema',
|
||||
num_children: 1,
|
||||
},
|
||||
{
|
||||
type: 6,
|
||||
repetition_type: 'OPTIONAL',
|
||||
name: 'ADDRTYPE',
|
||||
converted_type: 'UTF8',
|
||||
},
|
||||
],
|
||||
num_rows: 10,
|
||||
row_groups: [
|
||||
{
|
||||
columns: [
|
||||
{
|
||||
file_offset: 0,
|
||||
meta_data: {
|
||||
type: 6,
|
||||
encodings: [0, 8],
|
||||
path_in_schema: ['ADDRTYPE'],
|
||||
codec: 'SNAPPY',
|
||||
num_values: 10,
|
||||
total_uncompressed_size: 78,
|
||||
total_compressed_size: 82,
|
||||
data_page_offset: 31,
|
||||
dictionary_page_offset: 4,
|
||||
statistics: {
|
||||
max: 'Intersection',
|
||||
min: 'Block',
|
||||
null_count: 1,
|
||||
distinct_count: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
],
|
||||
total_byte_size: 33024,
|
||||
num_rows: 10,
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
// Parquet v2 from pandas with 2 row groups
|
||||
const rowgroupsMetadata = {
|
||||
version: 2,
|
||||
created_by: 'parquet-cpp-arrow version 14.0.2',
|
||||
metadata_length: 1602,
|
||||
schema: [
|
||||
{
|
||||
repetition_type: 'REQUIRED',
|
||||
name: 'schema',
|
||||
num_children: 1,
|
||||
},
|
||||
{
|
||||
type: 2,
|
||||
repetition_type: 'OPTIONAL',
|
||||
name: 'numbers',
|
||||
},
|
||||
],
|
||||
num_rows: 15,
|
||||
row_groups: [
|
||||
{
|
||||
columns: [
|
||||
{
|
||||
file_offset: 150,
|
||||
file_path: undefined,
|
||||
meta_data: {
|
||||
codec: 'SNAPPY',
|
||||
data_page_offset: 71,
|
||||
dictionary_page_offset: 4,
|
||||
encoding_stats: [
|
||||
{ count: 1, encoding: 0, page_type: 2 },
|
||||
{ count: 1, encoding: 8, page_type: 0 },
|
||||
],
|
||||
encodings: [0, 3, 8],
|
||||
num_values: 10,
|
||||
path_in_schema: ['numbers'],
|
||||
statistics: {
|
||||
max: '\n\x00\x00\x00\x00\x00\x00\x00',
|
||||
min: '\x01\x00\x00\x00\x00\x00\x00\x00',
|
||||
null_count: 0,
|
||||
},
|
||||
total_compressed_size: 146,
|
||||
total_uncompressed_size: 172,
|
||||
type: 2,
|
||||
},
|
||||
},
|
||||
],
|
||||
total_byte_size: 172,
|
||||
num_rows: 10,
|
||||
},
|
||||
{
|
||||
columns: [
|
||||
{
|
||||
file_offset: 368,
|
||||
meta_data: {
|
||||
codec: 'SNAPPY',
|
||||
data_page_offset: 294,
|
||||
dictionary_page_offset: 248,
|
||||
encoding_stats: [
|
||||
{ count: 1, encoding: 0, page_type: 2 },
|
||||
{ count: 1, encoding: 8, page_type: 0 },
|
||||
],
|
||||
encodings: [0, 3, 8],
|
||||
num_values: 5,
|
||||
path_in_schema: ['numbers'],
|
||||
statistics: {
|
||||
max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
|
||||
min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
|
||||
null_count: 0,
|
||||
},
|
||||
total_compressed_size: 120,
|
||||
total_uncompressed_size: 126,
|
||||
type: 2,
|
||||
},
|
||||
},
|
||||
],
|
||||
total_byte_size: 126,
|
||||
num_rows: 5,
|
||||
},
|
||||
],
|
||||
key_value_metadata: [
|
||||
{
|
||||
key: 'pandas',
|
||||
// value: json
|
||||
},
|
||||
{
|
||||
key: 'ARROW:schema',
|
||||
// value: base64
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user