From c2b48ab2fe97315fdb40d2827d1aaf7ead7604be Mon Sep 17 00:00:00 2001
From: Kenny Daniel <platypii@gmail.com>
Date: Mon, 15 Jan 2024 13:40:12 -0800
Subject: [PATCH] parquetMetadataAsync tests

---
 src/metadata.js       |   3 -
 test/metadata.test.js | 315 +++++++++++++++++++++++-------------------
 2 files changed, 175 insertions(+), 143 deletions(-)
diff --git a/src/metadata.js b/src/metadata.js
index a4374ef..336791b 100644
--- a/src/metadata.js
+++ b/src/metadata.js
@@ -64,9 +64,6 @@ export function parquetMetadata(arrayBuffer) {
   // Metadata length is 4 bytes before the last PAR1
   const metadataLengthOffset = view.byteLength - 8
   const metadataLength = view.getUint32(metadataLengthOffset, true)
-  if (metadataLength <= 0) {
-    throw new Error(`parquet invalid metadata length ${metadataLength}`)
-  }
   if (metadataLength > view.byteLength - 8) {
     // {metadata}, metadata_length, PAR1
     throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${view.byteLength - 8}`)
diff --git a/test/metadata.test.js b/test/metadata.test.js
index 1a8746b..ff0bd9c 100644
--- a/test/metadata.test.js
+++ b/test/metadata.test.js
@@ -1,6 +1,6 @@
-import { promises as fs } from 'fs'
+import fs from 'fs'
 import { describe, expect, it } from 'vitest'
-import { parquetMetadata } from '../src/metadata.js'
+import { parquetMetadata, parquetMetadataAsync } from '../src/metadata.js'
 import { toJson } from '../src/toJson.js'
 
 /**
@@ -10,156 +10,35 @@ import { toJson } from '../src/toJson.js'
  * @returns {Promise<ArrayBuffer>}
  */
 async function readFileToArrayBuffer(filePath) {
-  const buffer = await fs.readFile(filePath)
+  const buffer = await fs.promises.readFile(filePath)
   return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
 }
 
+/**
+ * Wrap a local .parquet file in an AsyncBuffer; every slice() re-reads the whole file
+ *
+ * @typedef {import('../src/types.js').AsyncBuffer} AsyncBuffer
+ * @param {string} filePath path of the file, stat'ed synchronously for byteLength
+ * @returns {AsyncBuffer} object exposing byteLength and an async slice(start, end)
+ */
+function fileToAsyncBuffer(filePath) {
+  return {
+    byteLength: fs.statSync(filePath).size,
+    slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end),
+  }
+}
+
 describe('parquetMetadata', () => {
   it('should correctly decode metadata from addrtype-missing-value.parquet', async () => {
     const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
     const result = parquetMetadata(arrayBuffer)
-
-    // Parquet v1 from DuckDB
-    const expectedMetadata = {
-      version: 1,
-      created_by: 'DuckDB',
-      metadata_length: 149,
-      schema: [
-        { repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
-        { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
-      ],
-      num_rows: 10,
-      row_groups: [
-        {
-          columns: [
-            {
-              file_offset: 0,
-              meta_data: {
-                type: 6,
-                encodings: [0, 8],
-                path_in_schema: ['ADDRTYPE'],
-                codec: 1,
-                num_values: 10,
-                total_uncompressed_size: 78,
-                total_compressed_size: 82,
-                data_page_offset: 31,
-                dictionary_page_offset: 4,
-                statistics: {
-                  max: 'Intersection',
-                  min: 'Block',
-                  null_count: 1,
-                  distinct_count: 2,
-                },
-              },
-            },
-          ],
-          total_byte_size: 33024,
-          num_rows: 10,
-        },
-      ],
-    }
-
-    const casted = toJson(result)
-    expect(casted).toEqual(expectedMetadata)
+    expect(toJson(result)).toEqual(addrtypeMetadata)
   })
 
   it('should correctly decode metadata from rowgroups.parquet', async () => {
     const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
     const result = parquetMetadata(arrayBuffer)
-
-    // Parquet v2 from pandas with 2 row groups
-    const expectedMetadata = {
-      version: 2,
-      created_by: 'parquet-cpp-arrow version 14.0.2',
-      metadata_length: 1602,
-      schema: [
-        {
-          repetition_type: 0,
-          name: 'schema',
-          num_children: 1,
-        },
-        {
-          type: 2,
-          repetition_type: 1,
-          name: 'numbers',
-        },
-      ],
-      num_rows: 15,
-      row_groups: [
-        {
-          columns: [
-            {
-              file_offset: 150,
-              file_path: undefined,
-              meta_data: {
-                codec: 1,
-                data_page_offset: 71,
-                dictionary_page_offset: 4,
-                encoding_stats: [
-                  { count: 1, encoding: 0, page_type: 2 },
-                  { count: 1, encoding: 8, page_type: 0 },
-                ],
-                encodings: [0, 3, 8],
-                num_values: 10,
-                path_in_schema: ['numbers'],
-                statistics: {
-                  max: '\n\x00\x00\x00\x00\x00\x00\x00',
-                  min: '\x01\x00\x00\x00\x00\x00\x00\x00',
-                  null_count: 0,
-                },
-                total_compressed_size: 146,
-                total_uncompressed_size: 172,
-                type: 2,
-              },
-            },
-          ],
-          total_byte_size: 172,
-          num_rows: 10,
-        },
-        {
-          columns: [
-            {
-              file_offset: 368,
-              meta_data: {
-                codec: 1,
-                data_page_offset: 294,
-                dictionary_page_offset: 248,
-                encoding_stats: [
-                  { count: 1, encoding: 0, page_type: 2 },
-                  { count: 1, encoding: 8, page_type: 0 },
-                ],
-                encodings: [0, 3, 8],
-                num_values: 5,
-                path_in_schema: ['numbers'],
-                statistics: {
-                  max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
-                  min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
-                  null_count: 0,
-                },
-                total_compressed_size: 120,
-                total_uncompressed_size: 126,
-                type: 2,
-              },
-            },
-          ],
-          total_byte_size: 126,
-          num_rows: 5,
-        },
-      ],
-      key_value_metadata: [
-        {
-          key: 'pandas',
-          // value: json
-        },
-        {
-          key: 'ARROW:schema',
-          // value: base64
-        },
-      ],
-    }
-
-    const casted = toJson(result)
-    expect(casted).containSubset(expectedMetadata)
+    expect(toJson(result)).containSubset(rowgroupsMetadata)
   })
 
   it('should throw an error for a too short file', () => {
@@ -167,8 +46,164 @@ describe('parquetMetadata', () => {
     expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')
   })
 
+  it('should throw an error for invalid metadata length', () => {
+    const arrayBuffer = new ArrayBuffer(12)
+    const view = new DataView(arrayBuffer)
+    view.setUint32(0, 0x31524150, true) // leading magic number PAR1
+    view.setUint32(4, 1000, true) // metadata length 1000 exceeds the 4 bytes available (12 - 8)
+    view.setUint32(8, 0x31524150, true) // trailing magic number PAR1
+    expect(() => parquetMetadata(arrayBuffer))
+      .toThrow('parquet metadata length 1000 exceeds available buffer 4')
+  })
+
   it('should throw an error for invalid magic number', () => {
     const arrayBuffer = new ArrayBuffer(8)
     expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number')
   })
 })
+
+describe('parquetMetadataAsync', () => {
+  it('should correctly decode metadata from addrtype-missing-value.parquet', async () => {
+    const asyncBuffer = fileToAsyncBuffer('test/files/addrtype-missing-value.parquet')
+    const result = await parquetMetadataAsync(asyncBuffer)
+    expect(toJson(result)).toEqual(addrtypeMetadata)
+  })
+
+  it('should correctly decode metadata from rowgroups.parquet', async () => {
+    const asyncBuffer = fileToAsyncBuffer('test/files/rowgroups.parquet')
+    // force two fetches: 1609 is one byte short of metadata_length 1602 + 8 footer bytes
+    const result = await parquetMetadataAsync(asyncBuffer, 1609)
+    expect(toJson(result)).containSubset(rowgroupsMetadata)
+  })
+})
+
+// Expected metadata for addrtype-missing-value.parquet (Parquet v1 from DuckDB), matched with toEqual
+const addrtypeMetadata = {
+  version: 1,
+  created_by: 'DuckDB',
+  metadata_length: 149,
+  schema: [
+    { repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
+    { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
+  ],
+  num_rows: 10,
+  row_groups: [
+    {
+      columns: [
+        {
+          file_offset: 0,
+          meta_data: {
+            type: 6,
+            encodings: [0, 8],
+            path_in_schema: ['ADDRTYPE'],
+            codec: 1,
+            num_values: 10,
+            total_uncompressed_size: 78,
+            total_compressed_size: 82,
+            data_page_offset: 31,
+            dictionary_page_offset: 4,
+            statistics: {
+              max: 'Intersection',
+              min: 'Block',
+              null_count: 1,
+              distinct_count: 2,
+            },
+          },
+        },
+      ],
+      total_byte_size: 33024,
+      num_rows: 10,
+    },
+  ],
+}
+
+// Expected subset for rowgroups.parquet (Parquet v2 from pandas with 2 row groups), matched with containSubset
+const rowgroupsMetadata = {
+  version: 2,
+  created_by: 'parquet-cpp-arrow version 14.0.2',
+  metadata_length: 1602,
+  schema: [
+    {
+      repetition_type: 0,
+      name: 'schema',
+      num_children: 1,
+    },
+    {
+      type: 2,
+      repetition_type: 1,
+      name: 'numbers',
+    },
+  ],
+  num_rows: 15,
+  row_groups: [
+    {
+      columns: [
+        {
+          file_offset: 150,
+          file_path: undefined,
+          meta_data: {
+            codec: 1,
+            data_page_offset: 71,
+            dictionary_page_offset: 4,
+            encoding_stats: [
+              { count: 1, encoding: 0, page_type: 2 },
+              { count: 1, encoding: 8, page_type: 0 },
+            ],
+            encodings: [0, 3, 8],
+            num_values: 10,
+            path_in_schema: ['numbers'],
+            statistics: {
+              max: '\n\x00\x00\x00\x00\x00\x00\x00',
+              min: '\x01\x00\x00\x00\x00\x00\x00\x00',
+              null_count: 0,
+            },
+            total_compressed_size: 146,
+            total_uncompressed_size: 172,
+            type: 2,
+          },
+        },
+      ],
+      total_byte_size: 172,
+      num_rows: 10,
+    },
+    {
+      columns: [
+        {
+          file_offset: 368,
+          meta_data: {
+            codec: 1,
+            data_page_offset: 294,
+            dictionary_page_offset: 248,
+            encoding_stats: [
+              { count: 1, encoding: 0, page_type: 2 },
+              { count: 1, encoding: 8, page_type: 0 },
+            ],
+            encodings: [0, 3, 8],
+            num_values: 5,
+            path_in_schema: ['numbers'],
+            statistics: {
+              max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
+              min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
+              null_count: 0,
+            },
+            total_compressed_size: 120,
+            total_uncompressed_size: 126,
+            type: 2,
+          },
+        },
+      ],
+      total_byte_size: 126,
+      num_rows: 5,
+    },
+  ],
+  key_value_metadata: [
+    {
+      key: 'pandas',
+      // value: json
+    },
+    {
+      key: 'ARROW:schema',
+      // value: base64
+    },
+  ],
+}