From c2b48ab2fe97315fdb40d2827d1aaf7ead7604be Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Mon, 15 Jan 2024 13:40:12 -0800 Subject: [PATCH] parquetMetadataAsync tests --- src/metadata.js | 3 - test/metadata.test.js | 315 +++++++++++++++++++++++------------------- 2 files changed, 175 insertions(+), 143 deletions(-) diff --git a/src/metadata.js b/src/metadata.js index a4374ef..336791b 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -64,9 +64,6 @@ export function parquetMetadata(arrayBuffer) { // Metadata length is 4 bytes before the last PAR1 const metadataLengthOffset = view.byteLength - 8 const metadataLength = view.getUint32(metadataLengthOffset, true) - if (metadataLength <= 0) { - throw new Error(`parquet invalid metadata length ${metadataLength}`) - } if (metadataLength > view.byteLength - 8) { // {metadata}, metadata_length, PAR1 throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${view.byteLength - 8}`) diff --git a/test/metadata.test.js b/test/metadata.test.js index 1a8746b..ff0bd9c 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -1,6 +1,6 @@ -import { promises as fs } from 'fs' +import fs from 'fs' import { describe, expect, it } from 'vitest' -import { parquetMetadata } from '../src/metadata.js' +import { parquetMetadata, parquetMetadataAsync } from '../src/metadata.js' import { toJson } from '../src/toJson.js' /** @@ -10,156 +10,35 @@ import { toJson } from '../src/toJson.js' * @returns {Promise} */ async function readFileToArrayBuffer(filePath) { - const buffer = await fs.readFile(filePath) + const buffer = await fs.promises.readFile(filePath) return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength) } +/** + * Wrap .parquet file in an AsyncBuffer + * + * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer + * @param {string} filePath + * @returns {AsyncBuffer} + */ +function fileToAsyncBuffer(filePath) { + return { + byteLength: fs.statSync(filePath).size, + slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end), + } +} + describe('parquetMetadata', () => { it('should correctly decode metadata from addrtype-missing-value.parquet', async () => { const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet') const result = parquetMetadata(arrayBuffer) - - // Parquet v1 from DuckDB - const expectedMetadata = { - version: 1, - created_by: 'DuckDB', - metadata_length: 149, - schema: [ - { repetition_type: 0, name: 'duckdb_schema', num_children: 1 }, - { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 }, - ], - num_rows: 10, - row_groups: [ - { - columns: [ - { - file_offset: 0, - meta_data: { - type: 6, - encodings: [0, 8], - path_in_schema: ['ADDRTYPE'], - codec: 1, - num_values: 10, - total_uncompressed_size: 78, - total_compressed_size: 82, - data_page_offset: 31, - dictionary_page_offset: 4, - statistics: { - max: 'Intersection', - min: 'Block', - null_count: 1, - distinct_count: 2, - }, - }, - }, - ], - total_byte_size: 33024, - num_rows: 10, - }, - ], - } - - const casted = toJson(result) - expect(casted).toEqual(expectedMetadata) + expect(toJson(result)).toEqual(addrtypeMetadata) }) it('should correctly decode metadata from rowgroups.parquet', async () => { const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet') const result = parquetMetadata(arrayBuffer) - - // Parquet v2 from pandas with 2 row groups - const expectedMetadata = { - version: 2, - created_by: 'parquet-cpp-arrow version 14.0.2', - metadata_length: 1602, - schema: [ - { - repetition_type: 0, - name: 'schema', - num_children: 1, - }, - { - type: 2, - repetition_type: 1, - name: 'numbers', - }, - ], - num_rows: 15, - row_groups: [ - { - columns: [ - { - file_offset: 150, - file_path: undefined, - meta_data: { - codec: 1, - data_page_offset: 71, - dictionary_page_offset: 4, - encoding_stats: [ - { count: 1, encoding: 0, page_type: 2 }, - { count: 1, encoding: 8, page_type: 0 }, - ], - encodings: [0, 3, 8], - num_values: 10, - path_in_schema: ['numbers'], - statistics: { - max: '\n\x00\x00\x00\x00\x00\x00\x00', - min: '\x01\x00\x00\x00\x00\x00\x00\x00', - null_count: 0, - }, - total_compressed_size: 146, - total_uncompressed_size: 172, - type: 2, - }, - }, - ], - total_byte_size: 172, - num_rows: 10, - }, - { - columns: [ - { - file_offset: 368, - meta_data: { - codec: 1, - data_page_offset: 294, - dictionary_page_offset: 248, - encoding_stats: [ - { count: 1, encoding: 0, page_type: 2 }, - { count: 1, encoding: 8, page_type: 0 }, - ], - encodings: [0, 3, 8], - num_values: 5, - path_in_schema: ['numbers'], - statistics: { - max: '\x0F\x00\x00\x00\x00\x00\x00\x00', - min: '\x0B\x00\x00\x00\x00\x00\x00\x00', - null_count: 0, - }, - total_compressed_size: 120, - total_uncompressed_size: 126, - type: 2, - }, - }, - ], - total_byte_size: 126, - num_rows: 5, - }, - ], - key_value_metadata: [ - { - key: 'pandas', - // value: json - }, - { - key: 'ARROW:schema', - // value: base64 - }, - ], - } - - const casted = toJson(result) - expect(casted).containSubset(expectedMetadata) + expect(toJson(result)).containSubset(rowgroupsMetadata) }) it('should throw an error for a too short file', () => { @@ -167,8 +46,164 @@ describe('parquetMetadata', () => { expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short') }) + it('should throw an error for invalid metadata length', () => { + const arrayBuffer = new ArrayBuffer(12) + const view = new DataView(arrayBuffer) + view.setUint32(0, 0x31524150, true) // magic number PAR1 + view.setUint32(4, 1000, true) // 1000 bytes exceeds buffer + view.setUint32(8, 0x31524150, true) // magic number PAR1 + expect(() => parquetMetadata(arrayBuffer)) + .toThrow('parquet metadata length 1000 exceeds available buffer 4') + }) + it('should throw an error for invalid magic number', () => { const arrayBuffer = new ArrayBuffer(8) expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number') }) }) + +describe('parquetMetadataAsync', () => { + it('should correctly decode metadata from addrtype-missing-value.parquet', async () => { + const asyncBuffer = fileToAsyncBuffer('test/files/addrtype-missing-value.parquet') + const result = await parquetMetadataAsync(asyncBuffer) + expect(toJson(result)).toEqual(addrtypeMetadata) + }) + + it('should correctly decode metadata from rowgroups.parquet', async () => { + const asyncBuffer = fileToAsyncBuffer('test/files/rowgroups.parquet') + // force two fetches + const result = await parquetMetadataAsync(asyncBuffer, 1609) + expect(toJson(result)).containSubset(rowgroupsMetadata) + }) +}) + +// Parquet v1 from DuckDB +const addrtypeMetadata = { + version: 1, + created_by: 'DuckDB', + metadata_length: 149, + schema: [ + { repetition_type: 0, name: 'duckdb_schema', num_children: 1 }, + { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 }, + ], + num_rows: 10, + row_groups: [ + { + columns: [ + { + file_offset: 0, + meta_data: { + type: 6, + encodings: [0, 8], + path_in_schema: ['ADDRTYPE'], + codec: 1, + num_values: 10, + total_uncompressed_size: 78, + total_compressed_size: 82, + data_page_offset: 31, + dictionary_page_offset: 4, + statistics: { + max: 'Intersection', + min: 'Block', + null_count: 1, + distinct_count: 2, + }, + }, + }, + ], + total_byte_size: 33024, + num_rows: 10, + }, + ], +} + +// Parquet v2 from pandas with 2 row groups +const rowgroupsMetadata = { + version: 2, + created_by: 'parquet-cpp-arrow version 14.0.2', + metadata_length: 1602, + schema: [ + { + repetition_type: 0, + name: 'schema', + num_children: 1, + }, + { + type: 2, + repetition_type: 1, + name: 'numbers', + }, + ], + num_rows: 15, + row_groups: [ + { + columns: [ + { + file_offset: 150, + file_path: undefined, + meta_data: { + codec: 1, + data_page_offset: 71, + dictionary_page_offset: 4, + encoding_stats: [ + { count: 1, encoding: 0, page_type: 2 }, + { count: 1, encoding: 8, page_type: 0 }, + ], + encodings: [0, 3, 8], + num_values: 10, + path_in_schema: ['numbers'], + statistics: { + max: '\n\x00\x00\x00\x00\x00\x00\x00', + min: '\x01\x00\x00\x00\x00\x00\x00\x00', + null_count: 0, + }, + total_compressed_size: 146, + total_uncompressed_size: 172, + type: 2, + }, + }, + ], + total_byte_size: 172, + num_rows: 10, + }, + { + columns: [ + { + file_offset: 368, + meta_data: { + codec: 1, + data_page_offset: 294, + dictionary_page_offset: 248, + encoding_stats: [ + { count: 1, encoding: 0, page_type: 2 }, + { count: 1, encoding: 8, page_type: 0 }, + ], + encodings: [0, 3, 8], + num_values: 5, + path_in_schema: ['numbers'], + statistics: { + max: '\x0F\x00\x00\x00\x00\x00\x00\x00', + min: '\x0B\x00\x00\x00\x00\x00\x00\x00', + null_count: 0, + }, + total_compressed_size: 120, + total_uncompressed_size: 126, + type: 2, + }, + }, + ], + total_byte_size: 126, + num_rows: 5, + }, + ], + key_value_metadata: [ + { + key: 'pandas', + // value: json + }, + { + key: 'ARROW:schema', + // value: base64 + }, + ], +}