mirror of https://github.com/asadbek064/hyparquet.git
parquetMetadataAsync tests
commit c2b48ab2fe (parent be7f2a8c77)
src/metadata.js:

@@ -64,9 +64,6 @@ export function parquetMetadata(arrayBuffer) {
   // Metadata length is 4 bytes before the last PAR1
   const metadataLengthOffset = view.byteLength - 8
   const metadataLength = view.getUint32(metadataLengthOffset, true)
-  if (metadataLength <= 0) {
-    throw new Error(`parquet invalid metadata length ${metadataLength}`)
-  }
   if (metadataLength > view.byteLength - 8) {
     // {metadata}, metadata_length, PAR1
     throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${view.byteLength - 8}`)
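For context, the footer this hunk validates is the same on every parquet file: the thrift-serialized metadata, then a 4-byte little-endian metadata length, then the 4-byte magic PAR1. A minimal standalone sketch of reading that footer (not hyparquet's exact code):

  // Extract the raw metadata bytes from the end of a parquet file
  function readFooterBytes(arrayBuffer) {
    const view = new DataView(arrayBuffer)
    // metadata length sits 8 bytes from the end: 4 length bytes + 4 magic bytes
    const metadataLength = view.getUint32(view.byteLength - 8, true)
    // the metadata itself immediately precedes the length field
    return arrayBuffer.slice(view.byteLength - 8 - metadataLength, view.byteLength - 8)
  }

Note that getUint32 can never return a negative value, so the deleted `metadataLength <= 0` branch could only ever fire for a length of exactly zero; the remaining upper-bound check is the guard that matters.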
test/metadata.test.js:

@@ -1,6 +1,6 @@
-import { promises as fs } from 'fs'
+import fs from 'fs'
 import { describe, expect, it } from 'vitest'
-import { parquetMetadata } from '../src/metadata.js'
+import { parquetMetadata, parquetMetadataAsync } from '../src/metadata.js'
 import { toJson } from '../src/toJson.js'
 
 /**
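The switch from the named `promises` binding to the default `fs` export is what lets the test file mix sync and async filesystem calls through one import, as the next hunk does. A small illustration (the fixture path is just the one the tests already use):

  import fs from 'fs'

  // both API styles hang off the default export
  const size = fs.statSync('test/files/rowgroups.parquet').size // sync, for a byteLength
  const bytes = await fs.promises.readFile('test/files/rowgroups.parquet') // async read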
@@ -10,156 +10,35 @@ import { toJson } from '../src/toJson.js'
  * @returns {Promise<ArrayBuffer>}
  */
 async function readFileToArrayBuffer(filePath) {
-  const buffer = await fs.readFile(filePath)
+  const buffer = await fs.promises.readFile(filePath)
   return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
 }
 
+/**
+ * Wrap .parquet file in an AsyncBuffer
+ *
+ * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer
+ * @param {string} filePath
+ * @returns {AsyncBuffer}
+ */
+function fileToAsyncBuffer(filePath) {
+  return {
+    byteLength: fs.statSync(filePath).size,
+    slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end),
+  }
+}
+
 describe('parquetMetadata', () => {
   it('should correctly decode metadata from addrtype-missing-value.parquet', async () => {
     const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
     const result = parquetMetadata(arrayBuffer)
-
-    // Parquet v1 from DuckDB
-    const expectedMetadata = {
-      version: 1,
-      created_by: 'DuckDB',
-      metadata_length: 149,
-      schema: [
-        { repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
-        { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
-      ],
-      num_rows: 10,
-      row_groups: [
-        {
-          columns: [
-            {
-              file_offset: 0,
-              meta_data: {
-                type: 6,
-                encodings: [0, 8],
-                path_in_schema: ['ADDRTYPE'],
-                codec: 1,
-                num_values: 10,
-                total_uncompressed_size: 78,
-                total_compressed_size: 82,
-                data_page_offset: 31,
-                dictionary_page_offset: 4,
-                statistics: {
-                  max: 'Intersection',
-                  min: 'Block',
-                  null_count: 1,
-                  distinct_count: 2,
-                },
-              },
-            },
-          ],
-          total_byte_size: 33024,
-          num_rows: 10,
-        },
-      ],
-    }
-
-    const casted = toJson(result)
-    expect(casted).toEqual(expectedMetadata)
+    expect(toJson(result)).toEqual(addrtypeMetadata)
   })
 
   it('should correctly decode metadata from rowgroups.parquet', async () => {
     const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
     const result = parquetMetadata(arrayBuffer)
-
-    // Parquet v2 from pandas with 2 row groups
-    const expectedMetadata = {
-      version: 2,
-      created_by: 'parquet-cpp-arrow version 14.0.2',
-      metadata_length: 1602,
-      schema: [
-        {
-          repetition_type: 0,
-          name: 'schema',
-          num_children: 1,
-        },
-        {
-          type: 2,
-          repetition_type: 1,
-          name: 'numbers',
-        },
-      ],
-      num_rows: 15,
-      row_groups: [
-        {
-          columns: [
-            {
-              file_offset: 150,
-              file_path: undefined,
-              meta_data: {
-                codec: 1,
-                data_page_offset: 71,
-                dictionary_page_offset: 4,
-                encoding_stats: [
-                  { count: 1, encoding: 0, page_type: 2 },
-                  { count: 1, encoding: 8, page_type: 0 },
-                ],
-                encodings: [0, 3, 8],
-                num_values: 10,
-                path_in_schema: ['numbers'],
-                statistics: {
-                  max: '\n\x00\x00\x00\x00\x00\x00\x00',
-                  min: '\x01\x00\x00\x00\x00\x00\x00\x00',
-                  null_count: 0,
-                },
-                total_compressed_size: 146,
-                total_uncompressed_size: 172,
-                type: 2,
-              },
-            },
-          ],
-          total_byte_size: 172,
-          num_rows: 10,
-        },
-        {
-          columns: [
-            {
-              file_offset: 368,
-              meta_data: {
-                codec: 1,
-                data_page_offset: 294,
-                dictionary_page_offset: 248,
-                encoding_stats: [
-                  { count: 1, encoding: 0, page_type: 2 },
-                  { count: 1, encoding: 8, page_type: 0 },
-                ],
-                encodings: [0, 3, 8],
-                num_values: 5,
-                path_in_schema: ['numbers'],
-                statistics: {
-                  max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
-                  min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
-                  null_count: 0,
-                },
-                total_compressed_size: 120,
-                total_uncompressed_size: 126,
-                type: 2,
-              },
-            },
-          ],
-          total_byte_size: 126,
-          num_rows: 5,
-        },
-      ],
-      key_value_metadata: [
-        {
-          key: 'pandas',
-          // value: json
-        },
-        {
-          key: 'ARROW:schema',
-          // value: base64
-        },
-      ],
-    }
-
-    const casted = toJson(result)
-    expect(casted).containSubset(expectedMetadata)
+    expect(toJson(result)).containSubset(rowgroupsMetadata)
   })
 
   it('should throw an error for a too short file', () => {
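The fileToAsyncBuffer helper added in this hunk shows the whole AsyncBuffer contract: an object with a byteLength and a slice(start, end) that resolves to an ArrayBuffer. Any random-access source can satisfy that shape; for example, a hedged sketch of an HTTP range-request backend (illustrative only, not part of this commit):

  /**
   * Wrap a remote file in an AsyncBuffer via HTTP range requests.
   * Assumes the server honors Range headers; byteLength would come
   * from a prior HEAD request or Content-Length.
   * @param {string} url
   * @param {number} byteLength
   */
  function urlToAsyncBuffer(url, byteLength) {
    return {
      byteLength,
      // fetch only bytes [start, end) rather than the whole file
      slice: async (start, end) => {
        const res = await fetch(url, { headers: { Range: `bytes=${start}-${end - 1}` } })
        return res.arrayBuffer()
      },
    }
  }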
@@ -167,8 +46,164 @@ describe('parquetMetadata', () => {
     expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')
   })
 
+  it('should throw an error for invalid metadata length', () => {
+    const arrayBuffer = new ArrayBuffer(12)
+    const view = new DataView(arrayBuffer)
+    view.setUint32(0, 0x31524150, true) // magic number PAR1
+    view.setUint32(4, 1000, true) // 1000 bytes exceeds buffer
+    view.setUint32(8, 0x31524150, true) // magic number PAR1
+    expect(() => parquetMetadata(arrayBuffer))
+      .toThrow('parquet metadata length 1000 exceeds available buffer 4')
+  })
+
   it('should throw an error for invalid magic number', () => {
     const arrayBuffer = new ArrayBuffer(8)
     expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number')
   })
 })
+
+describe('parquetMetadataAsync', () => {
+  it('should correctly decode metadata from addrtype-missing-value.parquet', async () => {
+    const asyncBuffer = fileToAsyncBuffer('test/files/addrtype-missing-value.parquet')
+    const result = await parquetMetadataAsync(asyncBuffer)
+    expect(toJson(result)).toEqual(addrtypeMetadata)
+  })
+
+  it('should correctly decode metadata from rowgroups.parquet', async () => {
+    const asyncBuffer = fileToAsyncBuffer('test/files/rowgroups.parquet')
+    // force two fetches
+    const result = await parquetMetadataAsync(asyncBuffer, 1609)
+    expect(toJson(result)).containSubset(rowgroupsMetadata)
+  })
+})
+
+// Parquet v1 from DuckDB
+const addrtypeMetadata = {
+  version: 1,
+  created_by: 'DuckDB',
+  metadata_length: 149,
+  schema: [
+    { repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
+    { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
+  ],
+  num_rows: 10,
+  row_groups: [
+    {
+      columns: [
+        {
+          file_offset: 0,
+          meta_data: {
+            type: 6,
+            encodings: [0, 8],
+            path_in_schema: ['ADDRTYPE'],
+            codec: 1,
+            num_values: 10,
+            total_uncompressed_size: 78,
+            total_compressed_size: 82,
+            data_page_offset: 31,
+            dictionary_page_offset: 4,
+            statistics: {
+              max: 'Intersection',
+              min: 'Block',
+              null_count: 1,
+              distinct_count: 2,
+            },
+          },
+        },
+      ],
+      total_byte_size: 33024,
+      num_rows: 10,
+    },
+  ],
+}
+
+// Parquet v2 from pandas with 2 row groups
+const rowgroupsMetadata = {
+  version: 2,
+  created_by: 'parquet-cpp-arrow version 14.0.2',
+  metadata_length: 1602,
+  schema: [
+    {
+      repetition_type: 0,
+      name: 'schema',
+      num_children: 1,
+    },
+    {
+      type: 2,
+      repetition_type: 1,
+      name: 'numbers',
+    },
+  ],
+  num_rows: 15,
+  row_groups: [
+    {
+      columns: [
+        {
+          file_offset: 150,
+          file_path: undefined,
+          meta_data: {
+            codec: 1,
+            data_page_offset: 71,
+            dictionary_page_offset: 4,
+            encoding_stats: [
+              { count: 1, encoding: 0, page_type: 2 },
+              { count: 1, encoding: 8, page_type: 0 },
+            ],
+            encodings: [0, 3, 8],
+            num_values: 10,
+            path_in_schema: ['numbers'],
+            statistics: {
+              max: '\n\x00\x00\x00\x00\x00\x00\x00',
+              min: '\x01\x00\x00\x00\x00\x00\x00\x00',
+              null_count: 0,
+            },
+            total_compressed_size: 146,
+            total_uncompressed_size: 172,
+            type: 2,
+          },
+        },
+      ],
+      total_byte_size: 172,
+      num_rows: 10,
+    },
+    {
+      columns: [
+        {
+          file_offset: 368,
+          meta_data: {
+            codec: 1,
+            data_page_offset: 294,
+            dictionary_page_offset: 248,
+            encoding_stats: [
+              { count: 1, encoding: 0, page_type: 2 },
+              { count: 1, encoding: 8, page_type: 0 },
+            ],
+            encodings: [0, 3, 8],
+            num_values: 5,
+            path_in_schema: ['numbers'],
+            statistics: {
+              max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
+              min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
+              null_count: 0,
+            },
+            total_compressed_size: 120,
+            total_uncompressed_size: 126,
+            type: 2,
+          },
+        },
+      ],
+      total_byte_size: 126,
+      num_rows: 5,
+    },
+  ],
+  key_value_metadata: [
+    {
+      key: 'pandas',
+      // value: json
+    },
+    {
+      key: 'ARROW:schema',
+      // value: base64
+    },
+  ],
+}
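A note on the magic constant in the rowgroups async test: parquetMetadataAsync(asyncBuffer, 1609) passes an initial fetch size of 1609 bytes, deliberately one byte short. The footer of rowgroups.parquet occupies metadata_length (1602) + 4 length bytes + 4 magic bytes = 1610 bytes, so the first read from the end of the file misses the start of the metadata and a second fetch is required, which is exactly the code path the `// force two fetches` comment targets. A hedged sketch of that two-step strategy (names and the default size are illustrative, not necessarily hyparquet's exact internals):

  // Sketch: read the file tail, then top up if the metadata did not fit.
  async function fetchMetadataBytes(asyncBuffer, initialFetchSize = 512 * 1024) {
    const { byteLength } = asyncBuffer
    // 1st fetch: the last initialFetchSize bytes (or the whole file if smaller)
    const tail = await asyncBuffer.slice(Math.max(0, byteLength - initialFetchSize), byteLength)
    const view = new DataView(tail)
    const metadataLength = view.getUint32(tail.byteLength - 8, true)
    const footerSize = metadataLength + 8 // {metadata}, metadata_length, PAR1
    if (footerSize <= tail.byteLength) {
      // metadata already fits in what we fetched
      return tail.slice(tail.byteLength - footerSize, tail.byteLength - 8)
    }
    // 2nd fetch: re-read enough of the end of the file to cover the metadata
    return asyncBuffer.slice(byteLength - footerSize, byteLength - 8)
  }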