Metadata test rowgroups.parquet

2026-03-31 11:08:02 +00:00 · 2024-01-11 11:06:37 -08:00 · 2024-01-11 11:06:37 -08:00 · 62632d9333
commit 62632d9333
parent f2a15bd74f
3 changed files with 102 additions and 0 deletions
--- a/src/metadata.js
+++ b/src/metadata.js
@@ -26,6 +26,9 @@ export function parquetMetadata(arrayBuffer) {
  if (metadataLength <= 0 || metadataLength > metadataLengthOffset) {
    throw new Error('parquet file invalid metadata length')
  }
+  if (metadataLength > view.byteLength - 8) {
+    throw new Error('parquet file metadata length exceeds file size')
+  }

  const metadataOffset = metadataLengthOffset - metadataLength
  const metadataBuffer = view.buffer.slice(metadataOffset, metadataLengthOffset)
--- a/test/files/rowgroups.parquet
+++ b/test/files/rowgroups.parquet
--- a/test/metadata.test.js
+++ b/test/metadata.test.js
@@ -19,6 +19,7 @@ describe('parquetMetadata', () => {
    const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
    const result = parquetMetadata(arrayBuffer)

+    // Parquet v1 from DuckDB
    const expectedMetadata = {
      version: 1,
      schema: [
@@ -61,6 +62,104 @@ describe('parquetMetadata', () => {
    expect(casted).toEqual(expectedMetadata)
  })

+  it('should correctly decode metadata from rowgroups.parquet', async () => {
+    const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
+    const result = parquetMetadata(arrayBuffer)
+
+    // Parquet v2 from pandas with 2 row groups
+    const expectedMetadata = {
+      version: 2,
+      schema: [
+        {
+          repetition_type: 0,
+          name: 'schema',
+          num_children: 1,
+        },
+        {
+          type: 2,
+          repetition_type: 1,
+          name: 'numbers',
+        },
+      ],
+      num_rows: 15,
+      row_groups: [
+        {
+          columns: [
+            {
+              file_offset: 150,
+              file_path: undefined,
+              meta_data: {
+                codec: 1,
+                data_page_offset: 71,
+                dictionary_page_offset: 4,
+                encoding_stats: [
+                  { count: 1, encoding: 0, page_type: 2 },
+                  { count: 1, encoding: 8, page_type: 0 },
+                ],
+                encodings: [0, 3, 8],
+                num_values: 10,
+                path_in_schema: ['numbers'],
+                statistics: {
+                  max: '\n\x00\x00\x00\x00\x00\x00\x00',
+                  min: '\x01\x00\x00\x00\x00\x00\x00\x00',
+                  null_count: 0,
+                },
+                total_compressed_size: 146,
+                total_uncompressed_size: 172,
+                type: 2,
+              },
+            },
+          ],
+          total_byte_size: 172,
+          num_rows: 10,
+        },
+        {
+          columns: [
+            {
+              file_offset: 368,
+              meta_data: {
+                codec: 1,
+                data_page_offset: 294,
+                dictionary_page_offset: 248,
+                encoding_stats: [
+                  { count: 1, encoding: 0, page_type: 2 },
+                  { count: 1, encoding: 8, page_type: 0 },
+                ],
+                encodings: [0, 3, 8],
+                num_values: 5,
+                path_in_schema: ['numbers'],
+                statistics: {
+                  max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
+                  min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
+                  null_count: 0,
+                },
+                total_compressed_size: 120,
+                total_uncompressed_size: 126,
+                type: 2,
+              },
+            },
+          ],
+          total_byte_size: 126,
+          num_rows: 5,
+        },
+      ],
+      key_value_metadata: [
+        {
+          key: 'pandas',
+          // value: json
+        },
+        {
+          key: 'ARROW:schema',
+          // value: base64
+        },
+      ],
+      created_by: 'parquet-cpp-arrow version 14.0.2',
+    }
+
+    const casted = toJson(result)
+    expect(casted).containSubset(expectedMetadata)
+  })
+
  it('should throw an error for a too short file', () => {
    const arrayBuffer = new ArrayBuffer(0)
    expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')