import { parquetMetadata } from 'hyparquet'
import { describe, expect, it } from 'vitest'
import { ByteWriter } from '../src/bytewriter.js'
import { logicalType, writeMetadata } from '../src/metadata.js'
import { exampleMetadata } from './example.js'

/**
 * @import {FileMetaData, LogicalType} from 'hyparquet'
 * @import {ThriftObject} from '../src/types.js'
 */

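// Round-trip checks: writeMetadata serializes FileMetaData into the writer,
// and hyparquet's parquetMetadata should parse it back to an identical object.
// 0x31524150 is the 'PAR1' magic appended as a little-endian uint32.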
describe('writeMetadata', () => {
  it('writes metadata and parses in hyparquet', () => {
    const writer = new ByteWriter()

    // write header PAR1
    writer.appendUint32(0x31524150)

    // write metadata
    /** @type {FileMetaData} */
    const withKvMetadata = {
      ...exampleMetadata,
      key_value_metadata: [
        { key: 'key1', value: 'value1' },
        { key: 'key2', value: 'value2' },
      ],
      metadata_length: 477,
    }
    writeMetadata(writer, withKvMetadata)

    // write footer PAR1
    writer.appendUint32(0x31524150)

    const file = writer.getBuffer()
    const outputMetadata = parquetMetadata(file)

    expect(outputMetadata).toEqual(withKvMetadata)
  })

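  // Exercises the optional ColumnChunk and ColumnMetaData fields seen below
  // (statistics, bloom filter, size_statistics, geospatial_statistics,
  // page/column index offsets, encrypted_column_metadata) plus
  // sorting_columns on the row group.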
  it('writes extended column metadata fields', () => {
    const writer = new ByteWriter()
    writer.appendUint32(0x31524150)

    /** @type {FileMetaData} */
    const extendedMetadata = {
      version: 2,
      created_by: 'hyparquet',
      schema: [
        { name: 'root', num_children: 1 },
        {
          name: 'geo',
          type: 'BYTE_ARRAY',
          repetition_type: 'REQUIRED',
          logical_type: { type: 'GEOGRAPHY', crs: 'EPSG:4326', algorithm: 'KARNEY' },
        },
      ],
      num_rows: 1n,
      row_groups: [{
        columns: [{
          file_path: 'part-0.parquet',
          file_offset: 4n,
          meta_data: {
            type: 'BYTE_ARRAY',
            encodings: ['PLAIN', 'RLE'],
            path_in_schema: [],
            codec: 'SNAPPY',
            num_values: 1n,
            total_uncompressed_size: 10n,
            total_compressed_size: 8n,
            key_value_metadata: [{ key: 'chunk', value: 'value' }],
            data_page_offset: 4n,
            index_page_offset: 12n,
            dictionary_page_offset: 20n,
            statistics: {
              null_count: 0n,
              min_value: 'a',
              max_value: 'z',
            },
            encoding_stats: [{ page_type: 'DATA_PAGE', encoding: 'PLAIN', count: 1 }],
            bloom_filter_offset: 30n,
            bloom_filter_length: 4,
            size_statistics: {
              unencoded_byte_array_data_bytes: 5n,
              repetition_level_histogram: [1n, 0n],
              definition_level_histogram: [2n, 0n],
            },
            geospatial_statistics: {
              bbox: {
                xmin: 0,
                xmax: 10,
                ymin: -5,
                ymax: 5,
                zmin: 1,
                zmax: 2,
                mmin: 3,
                mmax: 4,
              },
              geospatial_types: [0, 1],
            },
          },
          offset_index_offset: 40n,
          offset_index_length: 16,
          column_index_offset: 60n,
          column_index_length: 24,
          encrypted_column_metadata: new Uint8Array([7, 8, 9]),
        }],
        total_byte_size: 64n,
        num_rows: 1n,
        sorting_columns: [{
          column_idx: 0,
          descending: true,
          nulls_first: false,
        }],
        file_offset: 4n,
        total_compressed_size: 8n,
      }],
      key_value_metadata: [{ key: 'meta', value: 'data' }],
      metadata_length: 223,
    }

    writeMetadata(writer, extendedMetadata)
    writer.appendUint32(0x31524150)

    const outputMetadata = parquetMetadata(writer.getBuffer())
    expect(outputMetadata).toEqual(extendedMetadata)
  })
})

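// logicalType maps a hyparquet LogicalType onto the nested thrift-style object
// form (see ThriftObject in ../src/types.js) used when writing the footer.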
describe('logicalType', () => {
  it('returns undefined when given undefined', () => {
    expect(logicalType(undefined)).toBeUndefined()
  })

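  // The field_N keys appear to follow the field ids of the LogicalType union
  // in parquet.thrift (1 = STRING ... 18 = GEOGRAPHY), with NULL mapping to
  // field 11 and field 9 unused.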
  it('returns correct object for known types', () => {
    /** @type {{ input: LogicalType, expected: ThriftObject }[]} */
    const testCases = [
      { input: { type: 'STRING' }, expected: { field_1: {} } },
      { input: { type: 'MAP' }, expected: { field_2: {} } },
      { input: { type: 'LIST' }, expected: { field_3: {} } },
      { input: { type: 'ENUM' }, expected: { field_4: {} } },
      {
        input: { type: 'DECIMAL', scale: 2, precision: 5 },
        expected: { field_5: { field_1: 2, field_2: 5 } },
      },
      { input: { type: 'DATE' }, expected: { field_6: {} } },
      {
        input: { type: 'TIME', isAdjustedToUTC: true, unit: 'MILLIS' },
        expected: { field_7: { field_1: true, field_2: { field_1: {} } } },
      },
      {
        input: { type: 'TIMESTAMP', isAdjustedToUTC: false, unit: 'MICROS' },
        expected: { field_8: { field_1: false, field_2: { field_2: {} } } },
      },
      {
        input: { type: 'TIMESTAMP', isAdjustedToUTC: false, unit: 'NANOS' },
        expected: { field_8: { field_1: false, field_2: { field_3: {} } } },
      },
      {
        input: { type: 'INTEGER', bitWidth: 32, isSigned: true },
        expected: { field_10: { field_1: 32, field_2: true } },
      },
      { input: { type: 'NULL' }, expected: { field_11: {} } },
      { input: { type: 'JSON' }, expected: { field_12: {} } },
      { input: { type: 'BSON' }, expected: { field_13: {} } },
      { input: { type: 'UUID' }, expected: { field_14: {} } },
      { input: { type: 'FLOAT16' }, expected: { field_15: {} } },
      { input: { type: 'VARIANT' }, expected: { field_16: {} } },
      { input: { type: 'GEOMETRY' }, expected: { field_17: {} } },
      { input: { type: 'GEOGRAPHY' }, expected: { field_18: {} } },
    ]

    testCases.forEach(({ input, expected }) => {
      expect(logicalType(input)).toEqual(expected)
    })
  })
})