diff --git a/src/metadata.js b/src/metadata.js
index 07d8410..a969805 100644
--- a/src/metadata.js
+++ b/src/metadata.js
@@ -26,6 +26,9 @@ export function parquetMetadata(arrayBuffer) {
   if (metadataLength <= 0 || metadataLength > metadataLengthOffset) {
     throw new Error('parquet file invalid metadata length')
   }
+  if (metadataLength > view.byteLength - 8) {
+    throw new Error('parquet file metadata length exceeds file size')
+  }
 
   const metadataOffset = metadataLengthOffset - metadataLength
   const metadataBuffer = view.buffer.slice(metadataOffset, metadataLengthOffset)
diff --git a/test/files/rowgroups.parquet b/test/files/rowgroups.parquet
new file mode 100644
index 0000000..72ad90d
Binary files /dev/null and b/test/files/rowgroups.parquet differ
diff --git a/test/metadata.test.js b/test/metadata.test.js
index 3ddc485..6406f44 100644
--- a/test/metadata.test.js
+++ b/test/metadata.test.js
@@ -19,6 +19,7 @@ describe('parquetMetadata', () => {
     const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
     const result = parquetMetadata(arrayBuffer)
 
+    // Parquet v1 from DuckDB
     const expectedMetadata = {
       version: 1,
       schema: [
@@ -61,6 +62,104 @@ describe('parquetMetadata', () => {
     expect(casted).toEqual(expectedMetadata)
   })
 
+  it('should correctly decode metadata from rowgroups.parquet', async () => {
+    const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
+    const result = parquetMetadata(arrayBuffer)
+
+    // Parquet v2 from pandas with 2 row groups
+    const expectedMetadata = {
+      version: 2,
+      schema: [
+        {
+          repetition_type: 0,
+          name: 'schema',
+          num_children: 1,
+        },
+        {
+          type: 2,
+          repetition_type: 1,
+          name: 'numbers',
+        },
+      ],
+      num_rows: 15,
+      row_groups: [
+        {
+          columns: [
+            {
+              file_offset: 150,
+              file_path: undefined,
+              meta_data: {
+                codec: 1,
+                data_page_offset: 71,
+                dictionary_page_offset: 4,
+                encoding_stats: [
+                  { count: 1, encoding: 0, page_type: 2 },
+                  { count: 1, encoding: 8, page_type: 0 },
+                ],
+                encodings: [0, 3, 8],
+                num_values: 10,
+                path_in_schema: ['numbers'],
+                statistics: {
+                  max: '\n\x00\x00\x00\x00\x00\x00\x00',
+                  min: '\x01\x00\x00\x00\x00\x00\x00\x00',
+                  null_count: 0,
+                },
+                total_compressed_size: 146,
+                total_uncompressed_size: 172,
+                type: 2,
+              },
+            },
+          ],
+          total_byte_size: 172,
+          num_rows: 10,
+        },
+        {
+          columns: [
+            {
+              file_offset: 368,
+              meta_data: {
+                codec: 1,
+                data_page_offset: 294,
+                dictionary_page_offset: 248,
+                encoding_stats: [
+                  { count: 1, encoding: 0, page_type: 2 },
+                  { count: 1, encoding: 8, page_type: 0 },
+                ],
+                encodings: [0, 3, 8],
+                num_values: 5,
+                path_in_schema: ['numbers'],
+                statistics: {
+                  max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
+                  min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
+                  null_count: 0,
+                },
+                total_compressed_size: 120,
+                total_uncompressed_size: 126,
+                type: 2,
+              },
+            },
+          ],
+          total_byte_size: 126,
+          num_rows: 5,
+        },
+      ],
+      key_value_metadata: [
+        {
+          key: 'pandas',
+          // value: json
+        },
+        {
+          key: 'ARROW:schema',
+          // value: base64
+        },
+      ],
+      created_by: 'parquet-cpp-arrow version 14.0.2',
+    }
+
+    const casted = toJson(result)
+    expect(casted).containSubset(expectedMetadata)
+  })
+
   it('should throw an error for a too short file', () => {
     const arrayBuffer = new ArrayBuffer(0)
     expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')