mirror of https://github.com/asadbek064/hyparquet.git
parquetMetadataAsync tests
commit c2b48ab2fe (parent be7f2a8c77)
src/metadata.js:

@@ -64,9 +64,6 @@ export function parquetMetadata(arrayBuffer) {
   // Metadata length is 4 bytes before the last PAR1
   const metadataLengthOffset = view.byteLength - 8
   const metadataLength = view.getUint32(metadataLengthOffset, true)
-  if (metadataLength <= 0) {
-    throw new Error(`parquet invalid metadata length ${metadataLength}`)
-  }
   if (metadataLength > view.byteLength - 8) {
     // {metadata}, metadata_length, PAR1
     throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${view.byteLength - 8}`)
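For context, the footer this hunk validates is the same on every parquet file: the thrift-serialized metadata, then a 4-byte little-endian metadata length, then the 4-byte magic PAR1. A minimal standalone sketch of reading that footer (not hyparquet's exact code):

  // Extract the raw metadata bytes from the end of a parquet file
  function readFooterBytes(arrayBuffer) {
    const view = new DataView(arrayBuffer)
    // metadata length sits 8 bytes from the end: 4 length bytes + 4 magic bytes
    const metadataLength = view.getUint32(view.byteLength - 8, true)
    // the metadata itself immediately precedes the length field
    return arrayBuffer.slice(view.byteLength - 8 - metadataLength, view.byteLength - 8)
  }

Note that getUint32 can never return a negative value, so the deleted `metadataLength <= 0` branch could only ever fire for a length of exactly zero; the remaining upper-bound check is the guard that matters.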
test/metadata.test.js:

@@ -1,6 +1,6 @@
-import { promises as fs } from 'fs'
+import fs from 'fs'
 import { describe, expect, it } from 'vitest'
-import { parquetMetadata } from '../src/metadata.js'
+import { parquetMetadata, parquetMetadataAsync } from '../src/metadata.js'
 import { toJson } from '../src/toJson.js'
 
 /**
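The switch from the named `promises` binding to the default `fs` export is what lets the test file mix sync and async filesystem calls through one import, as the next hunk does. A small illustration (the fixture path is just the one the tests already use):

  import fs from 'fs'

  // both API styles hang off the default export
  const size = fs.statSync('test/files/rowgroups.parquet').size // sync, for a byteLength
  const bytes = await fs.promises.readFile('test/files/rowgroups.parquet') // async read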
@@ -10,156 +10,35 @@ import { toJson } from '../src/toJson.js'
  * @returns {Promise<ArrayBuffer>}
  */
 async function readFileToArrayBuffer(filePath) {
-  const buffer = await fs.readFile(filePath)
+  const buffer = await fs.promises.readFile(filePath)
   return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
 }
 
+/**
+ * Wrap .parquet file in an AsyncBuffer
+ *
+ * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer
+ * @param {string} filePath
+ * @returns {AsyncBuffer}
+ */
+function fileToAsyncBuffer(filePath) {
+  return {
+    byteLength: fs.statSync(filePath).size,
+    slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end),
+  }
+}
+
 describe('parquetMetadata', () => {
   it('should correctly decode metadata from addrtype-missing-value.parquet', async () => {
     const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
     const result = parquetMetadata(arrayBuffer)
-
-    // Parquet v1 from DuckDB
-    const expectedMetadata = {
-      version: 1,
-      created_by: 'DuckDB',
-      metadata_length: 149,
-      schema: [
-        { repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
-        { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
-      ],
-      num_rows: 10,
-      row_groups: [
-        {
-          columns: [
-            {
-              file_offset: 0,
-              meta_data: {
-                type: 6,
-                encodings: [0, 8],
-                path_in_schema: ['ADDRTYPE'],
-                codec: 1,
-                num_values: 10,
-                total_uncompressed_size: 78,
-                total_compressed_size: 82,
-                data_page_offset: 31,
-                dictionary_page_offset: 4,
-                statistics: {
-                  max: 'Intersection',
-                  min: 'Block',
-                  null_count: 1,
-                  distinct_count: 2,
-                },
-              },
-            },
-          ],
-          total_byte_size: 33024,
-          num_rows: 10,
-        },
-      ],
-    }
-
-    const casted = toJson(result)
-    expect(casted).toEqual(expectedMetadata)
+    expect(toJson(result)).toEqual(addrtypeMetadata)
   })
 
   it('should correctly decode metadata from rowgroups.parquet', async () => {
     const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
     const result = parquetMetadata(arrayBuffer)
-
-    // Parquet v2 from pandas with 2 row groups
-    const expectedMetadata = {
-      version: 2,
-      created_by: 'parquet-cpp-arrow version 14.0.2',
-      metadata_length: 1602,
-      schema: [
-        {
-          repetition_type: 0,
-          name: 'schema',
-          num_children: 1,
-        },
-        {
-          type: 2,
-          repetition_type: 1,
-          name: 'numbers',
-        },
-      ],
-      num_rows: 15,
-      row_groups: [
-        {
-          columns: [
-            {
-              file_offset: 150,
-              file_path: undefined,
-              meta_data: {
-                codec: 1,
-                data_page_offset: 71,
-                dictionary_page_offset: 4,
-                encoding_stats: [
-                  { count: 1, encoding: 0, page_type: 2 },
-                  { count: 1, encoding: 8, page_type: 0 },
-                ],
-                encodings: [0, 3, 8],
-                num_values: 10,
-                path_in_schema: ['numbers'],
-                statistics: {
-                  max: '\n\x00\x00\x00\x00\x00\x00\x00',
-                  min: '\x01\x00\x00\x00\x00\x00\x00\x00',
-                  null_count: 0,
-                },
-                total_compressed_size: 146,
-                total_uncompressed_size: 172,
-                type: 2,
-              },
-            },
-          ],
-          total_byte_size: 172,
-          num_rows: 10,
-        },
-        {
-          columns: [
-            {
-              file_offset: 368,
-              meta_data: {
-                codec: 1,
-                data_page_offset: 294,
-                dictionary_page_offset: 248,
-                encoding_stats: [
-                  { count: 1, encoding: 0, page_type: 2 },
-                  { count: 1, encoding: 8, page_type: 0 },
-                ],
-                encodings: [0, 3, 8],
-                num_values: 5,
-                path_in_schema: ['numbers'],
-                statistics: {
-                  max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
-                  min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
-                  null_count: 0,
-                },
-                total_compressed_size: 120,
-                total_uncompressed_size: 126,
-                type: 2,
-              },
-            },
-          ],
-          total_byte_size: 126,
-          num_rows: 5,
-        },
-      ],
-      key_value_metadata: [
-        {
-          key: 'pandas',
-          // value: json
-        },
-        {
-          key: 'ARROW:schema',
-          // value: base64
-        },
-      ],
-    }
-
-    const casted = toJson(result)
-    expect(casted).containSubset(expectedMetadata)
+    expect(toJson(result)).containSubset(rowgroupsMetadata)
   })
 
   it('should throw an error for a too short file', () => {
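The fileToAsyncBuffer helper added in this hunk shows the whole AsyncBuffer contract: an object with a byteLength and a slice(start, end) that resolves to an ArrayBuffer. Any random-access source can satisfy that shape; for example, a hedged sketch of an HTTP range-request backend (illustrative only, not part of this commit):

  /**
   * Wrap a remote file in an AsyncBuffer via HTTP range requests.
   * Assumes the server honors Range headers; byteLength would come
   * from a prior HEAD request or Content-Length.
   * @param {string} url
   * @param {number} byteLength
   */
  function urlToAsyncBuffer(url, byteLength) {
    return {
      byteLength,
      // fetch only bytes [start, end) rather than the whole file
      slice: async (start, end) => {
        const res = await fetch(url, { headers: { Range: `bytes=${start}-${end - 1}` } })
        return res.arrayBuffer()
      },
    }
  }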
@@ -167,8 +46,164 @@ describe('parquetMetadata', () => {
     expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')
   })
 
+  it('should throw an error for invalid metadata length', () => {
+    const arrayBuffer = new ArrayBuffer(12)
+    const view = new DataView(arrayBuffer)
+    view.setUint32(0, 0x31524150, true) // magic number PAR1
+    view.setUint32(4, 1000, true) // 1000 bytes exceeds buffer
+    view.setUint32(8, 0x31524150, true) // magic number PAR1
+    expect(() => parquetMetadata(arrayBuffer))
+      .toThrow('parquet metadata length 1000 exceeds available buffer 4')
+  })
+
   it('should throw an error for invalid magic number', () => {
     const arrayBuffer = new ArrayBuffer(8)
     expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number')
   })
 })
+
+describe('parquetMetadataAsync', () => {
+  it('should correctly decode metadata from addrtype-missing-value.parquet', async () => {
+    const asyncBuffer = fileToAsyncBuffer('test/files/addrtype-missing-value.parquet')
+    const result = await parquetMetadataAsync(asyncBuffer)
+    expect(toJson(result)).toEqual(addrtypeMetadata)
+  })
+
+  it('should correctly decode metadata from rowgroups.parquet', async () => {
+    const asyncBuffer = fileToAsyncBuffer('test/files/rowgroups.parquet')
+    // force two fetches
+    const result = await parquetMetadataAsync(asyncBuffer, 1609)
+    expect(toJson(result)).containSubset(rowgroupsMetadata)
+  })
+})
+
+// Parquet v1 from DuckDB
+const addrtypeMetadata = {
+  version: 1,
+  created_by: 'DuckDB',
+  metadata_length: 149,
+  schema: [
+    { repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
+    { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
+  ],
+  num_rows: 10,
+  row_groups: [
+    {
+      columns: [
+        {
+          file_offset: 0,
+          meta_data: {
+            type: 6,
+            encodings: [0, 8],
+            path_in_schema: ['ADDRTYPE'],
+            codec: 1,
+            num_values: 10,
+            total_uncompressed_size: 78,
+            total_compressed_size: 82,
+            data_page_offset: 31,
+            dictionary_page_offset: 4,
+            statistics: {
+              max: 'Intersection',
+              min: 'Block',
+              null_count: 1,
+              distinct_count: 2,
+            },
+          },
+        },
+      ],
+      total_byte_size: 33024,
+      num_rows: 10,
+    },
+  ],
+}
+
+// Parquet v2 from pandas with 2 row groups
+const rowgroupsMetadata = {
+  version: 2,
+  created_by: 'parquet-cpp-arrow version 14.0.2',
+  metadata_length: 1602,
+  schema: [
+    {
+      repetition_type: 0,
+      name: 'schema',
+      num_children: 1,
+    },
+    {
+      type: 2,
+      repetition_type: 1,
+      name: 'numbers',
+    },
+  ],
+  num_rows: 15,
+  row_groups: [
+    {
+      columns: [
+        {
+          file_offset: 150,
+          file_path: undefined,
+          meta_data: {
+            codec: 1,
+            data_page_offset: 71,
+            dictionary_page_offset: 4,
+            encoding_stats: [
+              { count: 1, encoding: 0, page_type: 2 },
+              { count: 1, encoding: 8, page_type: 0 },
+            ],
+            encodings: [0, 3, 8],
+            num_values: 10,
+            path_in_schema: ['numbers'],
+            statistics: {
+              max: '\n\x00\x00\x00\x00\x00\x00\x00',
+              min: '\x01\x00\x00\x00\x00\x00\x00\x00',
+              null_count: 0,
+            },
+            total_compressed_size: 146,
+            total_uncompressed_size: 172,
+            type: 2,
+          },
+        },
+      ],
+      total_byte_size: 172,
+      num_rows: 10,
+    },
+    {
+      columns: [
+        {
+          file_offset: 368,
+          meta_data: {
+            codec: 1,
+            data_page_offset: 294,
+            dictionary_page_offset: 248,
+            encoding_stats: [
+              { count: 1, encoding: 0, page_type: 2 },
+              { count: 1, encoding: 8, page_type: 0 },
+            ],
+            encodings: [0, 3, 8],
+            num_values: 5,
+            path_in_schema: ['numbers'],
+            statistics: {
+              max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
+              min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
+              null_count: 0,
+            },
+            total_compressed_size: 120,
+            total_uncompressed_size: 126,
+            type: 2,
+          },
+        },
+      ],
+      total_byte_size: 126,
+      num_rows: 5,
+    },
+  ],
+  key_value_metadata: [
+    {
+      key: 'pandas',
+      // value: json
+    },
+    {
+      key: 'ARROW:schema',
+      // value: base64
+    },
+  ],
+}
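A note on the magic constant in the rowgroups async test: parquetMetadataAsync(asyncBuffer, 1609) passes an initial fetch size of 1609 bytes, deliberately one byte short. The footer of rowgroups.parquet occupies metadata_length (1602) + 4 length bytes + 4 magic bytes = 1610 bytes, so the first read from the end of the file misses the start of the metadata and a second fetch is required, which is exactly the code path the `// force two fetches` comment targets. A hedged sketch of that two-step strategy (names and the default size are illustrative, not necessarily hyparquet's exact internals):

  // Sketch: read the file tail, then top up if the metadata did not fit.
  async function fetchMetadataBytes(asyncBuffer, initialFetchSize = 512 * 1024) {
    const { byteLength } = asyncBuffer
    // 1st fetch: the last initialFetchSize bytes (or the whole file if smaller)
    const tail = await asyncBuffer.slice(Math.max(0, byteLength - initialFetchSize), byteLength)
    const view = new DataView(tail)
    const metadataLength = view.getUint32(tail.byteLength - 8, true)
    const footerSize = metadataLength + 8 // {metadata}, metadata_length, PAR1
    if (footerSize <= tail.byteLength) {
      // metadata already fits in what we fetched
      return tail.slice(tail.byteLength - footerSize, tail.byteLength - 8)
    }
    // 2nd fetch: re-read enough of the end of the file to cover the metadata
    return asyncBuffer.slice(byteLength - footerSize, byteLength - 8)
  }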