hyparquet-writer/test/metadata.test.js
2025-04-11 19:40:17 -06:00

244 lines
7.1 KiB
JavaScript

import { parquetMetadata } from 'hyparquet'
import { describe, expect, it } from 'vitest'
import { ByteWriter } from '../src/bytewriter.js'
import { logicalType, writeMetadata } from '../src/metadata.js'
/**
* @import {FileMetaData, LogicalType} from 'hyparquet'
* @import {ThriftObject} from '../src/types.js'
* @type {FileMetaData}
*/
export const exampleMetadata = {
version: 2,
created_by: 'hyparquet',
schema: [
{ name: 'root', num_children: 7 },
{ name: 'bool', type: 'BOOLEAN', repetition_type: 'REQUIRED' },
{ name: 'int', type: 'INT32', repetition_type: 'REQUIRED' },
{ name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' },
{ name: 'float', type: 'FLOAT', repetition_type: 'REQUIRED' },
{ name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' },
{ name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED', converted_type: 'UTF8' },
{ name: 'nullable', type: 'BOOLEAN', repetition_type: 'OPTIONAL' },
],
num_rows: 4n,
row_groups: [{
columns: [
{
file_path: 'bool',
file_offset: 4n,
meta_data: {
type: 'BOOLEAN',
encodings: ['PLAIN'],
path_in_schema: ['bool'],
codec: 'SNAPPY',
num_values: 4n,
total_uncompressed_size: 24n,
total_compressed_size: 24n,
data_page_offset: 4n,
statistics: {
null_count: 0n,
min_value: false,
max_value: true,
},
},
},
{
file_path: 'int',
file_offset: 28n,
meta_data: {
type: 'INT32',
encodings: ['PLAIN'],
path_in_schema: ['int'],
codec: 'SNAPPY',
num_values: 4n,
total_uncompressed_size: 39n,
total_compressed_size: 39n,
data_page_offset: 28n,
statistics: {
null_count: 0n,
min_value: 0,
max_value: 0x7fffffff,
},
},
},
{
file_path: 'bigint',
file_offset: 67n,
meta_data: {
type: 'INT64',
encodings: ['PLAIN'],
path_in_schema: ['bigint'],
codec: 'SNAPPY',
num_values: 4n,
total_uncompressed_size: 43n,
total_compressed_size: 43n,
data_page_offset: 67n,
statistics: {
null_count: 0n,
min_value: 0n,
max_value: 0x7fffffffffffffffn,
},
},
},
{
file_path: 'float',
file_offset: 110n,
meta_data: {
type: 'FLOAT',
encodings: ['PLAIN'],
path_in_schema: ['float'],
codec: 'SNAPPY',
num_values: 4n,
total_uncompressed_size: 39n,
total_compressed_size: 39n,
data_page_offset: 110n,
statistics: {
null_count: 0n,
min_value: 0,
max_value: Infinity,
},
},
},
{
file_path: 'double',
file_offset: 149n,
meta_data: {
type: 'DOUBLE',
encodings: ['PLAIN'],
path_in_schema: ['double'],
codec: 'SNAPPY',
num_values: 4n,
total_uncompressed_size: 51n,
total_compressed_size: 51n,
data_page_offset: 149n,
statistics: {
null_count: 0n,
min_value: 0,
max_value: 1e100,
},
},
},
{
file_path: 'string',
file_offset: 200n,
meta_data: {
type: 'BYTE_ARRAY',
encodings: ['PLAIN'],
path_in_schema: ['string'],
codec: 'SNAPPY',
num_values: 4n,
total_uncompressed_size: 42n,
total_compressed_size: 42n,
data_page_offset: 200n,
statistics: {
null_count: 0n,
min_value: 'a',
max_value: 'd',
},
},
},
{
file_path: 'nullable',
file_offset: 242n,
meta_data: {
type: 'BOOLEAN',
encodings: ['PLAIN'],
path_in_schema: ['nullable'],
codec: 'SNAPPY',
num_values: 4n,
total_uncompressed_size: 26n,
total_compressed_size: 26n,
data_page_offset: 242n,
statistics: {
null_count: 2n,
min_value: false,
max_value: true,
},
},
},
],
total_byte_size: 264n,
num_rows: 4n,
}],
metadata_length: 497,
}
describe('writeMetadata', () => {
it('writes metadata and parses in hyparquet', () => {
const writer = new ByteWriter()
// write header PAR1
writer.appendUint32(0x31524150)
// write metadata
/** @type {FileMetaData} */
const withKvMetadata = {
...exampleMetadata,
key_value_metadata: [
{ key: 'key1', value: 'value1' },
{ key: 'key2', value: 'value2' },
],
metadata_length: 529,
}
writeMetadata(writer, withKvMetadata)
// write footer PAR1
writer.appendUint32(0x31524150)
const file = writer.getBuffer()
const outputMetadata = parquetMetadata(file)
expect(outputMetadata).toEqual(withKvMetadata)
})
})
describe('logicalType', () => {
it('returns undefined when given undefined', () => {
expect(logicalType(undefined)).toBeUndefined()
})
it('returns correct object for known types', () => {
/** @type {{ input: LogicalType, expected: ThriftObject }[]} */
const testCases = [
{ input: { type: 'STRING' }, expected: { field_1: {} } },
{ input: { type: 'MAP' }, expected: { field_2: {} } },
{ input: { type: 'LIST' }, expected: { field_3: {} } },
{ input: { type: 'ENUM' }, expected: { field_4: {} } },
{
input: { type: 'DECIMAL', scale: 2, precision: 5 },
expected: { field_5: { field_1: 2, field_2: 5 } },
},
{ input: { type: 'DATE' }, expected: { field_6: {} } },
{
input: { type: 'TIME', isAdjustedToUTC: true, unit: 'MILLIS' },
expected: { field_7: { field_1: true, field_2: { field_1: {} } } },
},
{
input: { type: 'TIMESTAMP', isAdjustedToUTC: false, unit: 'MICROS' },
expected: { field_8: { field_1: false, field_2: { field_2: {} } } },
},
{
input: { type: 'TIMESTAMP', isAdjustedToUTC: false, unit: 'NANOS' },
expected: { field_8: { field_1: false, field_2: { field_3: {} } } },
},
{
input: { type: 'INTEGER', bitWidth: 32, isSigned: true },
expected: { field_10: { field_1: 32, field_2: true } },
},
{ input: { type: 'NULL' }, expected: { field_11: {} } },
{ input: { type: 'JSON' }, expected: { field_12: {} } },
{ input: { type: 'BSON' }, expected: { field_13: {} } },
{ input: { type: 'UUID' }, expected: { field_14: {} } },
{ input: { type: 'FLOAT16' }, expected: { field_15: {} } },
{ input: { type: 'VARIANT' }, expected: { field_16: {} } },
{ input: { type: 'GEOMETRY' }, expected: { field_17: {} } },
{ input: { type: 'GEOGRAPHY' }, expected: { field_18: {} } },
]
testCases.forEach(({ input, expected }) => {
expect(logicalType(input)).toEqual(expected)
})
})
})