Test metadata of all sample files

2026-02-21 11:51:32 +00:00 · 2024-02-13 10:52:29 -08:00 · 2024-02-13 10:52:29 -08:00 · 2cf00da9aa
commit 2cf00da9aa
parent 514014d08b
5 changed files with 167 additions and 163 deletions
--- a/package.json
+++ b/package.json
@ -28,7 +28,7 @@
  },
  "devDependencies": {
    "@types/node": "20.11.17",
-    "@typescript-eslint/eslint-plugin": "6.21.0",
+    "@typescript-eslint/eslint-plugin": "7.0.1",
    "@vitest/coverage-v8": "1.2.2",
    "eslint": "8.56.0",
    "eslint-plugin-import": "2.29.1",
--- a/test/files/addrtype-missing-value.metadata.json
+++ b/test/files/addrtype-missing-value.metadata.json
@ -0,0 +1,47 @@
+{
+  "version": 1,
+  "created_by": "DuckDB",
+  "metadata_length": 149,
+  "schema": [
+    {
+      "repetition_type": "REQUIRED",
+      "name": "duckdb_schema",
+      "num_children": 1
+    },
+    {
+      "type": 6,
+      "repetition_type": "OPTIONAL",
+      "name": "ADDRTYPE",
+      "converted_type": "UTF8"
+    }
+  ],
+  "num_rows": 10,
+  "row_groups": [
+    {
+      "columns": [
+        {
+          "file_offset": 0,
+          "meta_data": {
+            "type": 6,
+            "encodings": [0, 8],
+            "path_in_schema": ["ADDRTYPE"],
+            "codec": "SNAPPY",
+            "num_values": 10,
+            "total_uncompressed_size": 78,
+            "total_compressed_size": 82,
+            "data_page_offset": 31,
+            "dictionary_page_offset": 4,
+            "statistics": {
+              "max": "Intersection",
+              "min": "Block",
+              "null_count": 1,
+              "distinct_count": 2
+            }
+          }
+        }
+      ],
+      "total_byte_size": 33024,
+      "num_rows": 10
+    }
+  ]
+}
--- a/test/files/rowgroups.metadata.json
+++ b/test/files/rowgroups.metadata.json
@ -0,0 +1,86 @@
+{
+  "version": 2,
+  "created_by": "parquet-cpp-arrow version 14.0.2",
+  "metadata_length": 1602,
+  "schema": [
+    {
+      "repetition_type": "REQUIRED",
+      "name": "schema",
+      "num_children": 1
+    },
+    {
+      "type": 2,
+      "repetition_type": "OPTIONAL",
+      "name": "numbers"
+    }
+  ],
+  "num_rows": 15,
+  "row_groups": [
+    {
+      "columns": [
+        {
+          "file_offset": 150,
+          "meta_data": {
+            "codec": "SNAPPY",
+            "data_page_offset": 71,
+            "dictionary_page_offset": 4,
+            "encoding_stats": [
+              { "count": 1, "encoding": 0, "page_type": 2 },
+              { "count": 1, "encoding": 8, "page_type": 0 }
+            ],
+            "encodings": [0, 3, 8],
+            "num_values": 10,
+            "path_in_schema": ["numbers"],
+            "statistics": {
+              "max": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
+              "min": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
+              "null_count": 0
+            },
+            "total_compressed_size": 146,
+            "total_uncompressed_size": 172,
+            "type": 2
+          }
+        }
+      ],
+      "total_byte_size": 172,
+      "num_rows": 10
+    },
+    {
+      "columns": [
+        {
+          "file_offset": 368,
+          "meta_data": {
+            "codec": "SNAPPY",
+            "data_page_offset": 294,
+            "dictionary_page_offset": 248,
+            "encoding_stats": [
+              { "count": 1, "encoding": 0, "page_type": 2 },
+              { "count": 1, "encoding": 8, "page_type": 0 }
+            ],
+            "encodings": [0, 3, 8],
+            "num_values": 5,
+            "path_in_schema": ["numbers"],
+            "statistics": {
+              "max": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
+              "min": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000",
+              "null_count": 0
+            },
+            "total_compressed_size": 120,
+            "total_uncompressed_size": 126,
+            "type": 2
+          }
+        }
+      ],
+      "total_byte_size": 126,
+      "num_rows": 5
+    }
+  ],
+  "key_value_metadata": [
+    {
+      "key": "pandas"
+    },
+    {
+      "key": "ARROW:schema"
+    }
+  ]
+}
--- a/test/helpers.js
+++ b/test/helpers.js
@ -24,3 +24,14 @@ export function fileToAsyncBuffer(filePath) {
    slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end),
  }
 }
+
+/**
+ * Read a JSON file (e.g. a .metadata.json test fixture) and parse its contents
+ *
+ * @param {string} filePath
+ * @returns {any} value produced by JSON.parse on the file's text
+ */
+export function fileToJson(filePath) {
+  const buffer = fs.readFileSync(filePath)
+  return JSON.parse(buffer.toString())
+}
--- a/test/metadata.test.js
+++ b/test/metadata.test.js
@ -1,19 +1,20 @@
+import fs from 'fs'
 import { describe, expect, it } from 'vitest'
 import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js'
 import { toJson } from '../src/toJson.js'
-import { fileToAsyncBuffer, readFileToArrayBuffer } from './helpers.js'
+import { fileToAsyncBuffer, fileToJson, readFileToArrayBuffer } from './helpers.js'

 describe('parquetMetadata', () => {
-  it('should parse metadata from addrtype-missing-value.parquet', async () => {
-    const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
-    const result = parquetMetadata(arrayBuffer)
-    expect(toJson(result)).toEqual(addrtypeMetadata)
-  })
-
-  it('should parse metadata from rowgroups.parquet', async () => {
-    const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
-    const result = parquetMetadata(arrayBuffer)
-    expect(toJson(result)).containSubset(rowgroupsMetadata)
+  it('should parse metadata from all test files', async () => {
+    const files = fs.readdirSync('test/files')
+    for (const file of files) {
+      if (!file.endsWith('.parquet')) continue
+      const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`)
+      const result = parquetMetadata(arrayBuffer)
+      const base = file.replace('.parquet', '')
+      const expected = fileToJson(`test/files/${base}.metadata.json`)
+      expect(toJson(result)).containSubset(expected)
+    }
  })

  it('should throw an error for a too short file', () => {
@ -45,17 +46,16 @@ describe('parquetMetadata', () => {
 })

 describe('parquetMetadataAsync', () => {
-  it('should parse metadata asynchronously from addrtype-missing-value.parquet', async () => {
-    const asyncBuffer = fileToAsyncBuffer('test/files/addrtype-missing-value.parquet')
-    const result = await parquetMetadataAsync(asyncBuffer)
-    expect(toJson(result)).toEqual(addrtypeMetadata)
-  })
-
-  it('should parse metadata asynchronously from rowgroups.parquet', async () => {
-    const asyncBuffer = fileToAsyncBuffer('test/files/rowgroups.parquet')
-    // force two fetches
-    const result = await parquetMetadataAsync(asyncBuffer, 1609)
-    expect(toJson(result)).containSubset(rowgroupsMetadata)
+  it('should parse metadata asynchronously from all test files', async () => {
+    const files = fs.readdirSync('test/files')
+    for (const file of files) {
+      if (!file.endsWith('.parquet')) continue
+      const asyncBuffer = fileToAsyncBuffer(`test/files/${file}`)
+      const result = await parquetMetadataAsync(asyncBuffer)
+      const base = file.replace('.parquet', '')
+      const expected = fileToJson(`test/files/${base}.metadata.json`)
+      expect(toJson(result)).containSubset(expected)
+    }
  })

  it('should throw an error for invalid magic number', () => {
@ -70,143 +70,3 @@ describe('parquetMetadataAsync', () => {
      .toThrow('parquet metadata length 4294967295 exceeds available buffer 0')
  })
 })
-
-// Parquet v1 from DuckDB
-const addrtypeMetadata = {
-  version: 1,
-  created_by: 'DuckDB',
-  metadata_length: 149,
-  schema: [
-    {
-      repetition_type: 'REQUIRED',
-      name: 'duckdb_schema',
-      num_children: 1,
-    },
-    {
-      type: 6,
-      repetition_type: 'OPTIONAL',
-      name: 'ADDRTYPE',
-      converted_type: 'UTF8',
-    },
-  ],
-  num_rows: 10,
-  row_groups: [
-    {
-      columns: [
-        {
-          file_offset: 0,
-          meta_data: {
-            type: 6,
-            encodings: [0, 8],
-            path_in_schema: ['ADDRTYPE'],
-            codec: 'SNAPPY',
-            num_values: 10,
-            total_uncompressed_size: 78,
-            total_compressed_size: 82,
-            data_page_offset: 31,
-            dictionary_page_offset: 4,
-            statistics: {
-              max: 'Intersection',
-              min: 'Block',
-              null_count: 1,
-              distinct_count: 2,
-            },
-          },
-        },
-      ],
-      total_byte_size: 33024,
-      num_rows: 10,
-    },
-  ],
-}
-
-// Parquet v2 from pandas with 2 row groups
-const rowgroupsMetadata = {
-  version: 2,
-  created_by: 'parquet-cpp-arrow version 14.0.2',
-  metadata_length: 1602,
-  schema: [
-    {
-      repetition_type: 'REQUIRED',
-      name: 'schema',
-      num_children: 1,
-    },
-    {
-      type: 2,
-      repetition_type: 'OPTIONAL',
-      name: 'numbers',
-    },
-  ],
-  num_rows: 15,
-  row_groups: [
-    {
-      columns: [
-        {
-          file_offset: 150,
-          file_path: undefined,
-          meta_data: {
-            codec: 'SNAPPY',
-            data_page_offset: 71,
-            dictionary_page_offset: 4,
-            encoding_stats: [
-              { count: 1, encoding: 0, page_type: 2 },
-              { count: 1, encoding: 8, page_type: 0 },
-            ],
-            encodings: [0, 3, 8],
-            num_values: 10,
-            path_in_schema: ['numbers'],
-            statistics: {
-              max: '\n\x00\x00\x00\x00\x00\x00\x00',
-              min: '\x01\x00\x00\x00\x00\x00\x00\x00',
-              null_count: 0,
-            },
-            total_compressed_size: 146,
-            total_uncompressed_size: 172,
-            type: 2,
-          },
-        },
-      ],
-      total_byte_size: 172,
-      num_rows: 10,
-    },
-    {
-      columns: [
-        {
-          file_offset: 368,
-          meta_data: {
-            codec: 'SNAPPY',
-            data_page_offset: 294,
-            dictionary_page_offset: 248,
-            encoding_stats: [
-              { count: 1, encoding: 0, page_type: 2 },
-              { count: 1, encoding: 8, page_type: 0 },
-            ],
-            encodings: [0, 3, 8],
-            num_values: 5,
-            path_in_schema: ['numbers'],
-            statistics: {
-              max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
-              min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
-              null_count: 0,
-            },
-            total_compressed_size: 120,
-            total_uncompressed_size: 126,
-            type: 2,
-          },
-        },
-      ],
-      total_byte_size: 126,
-      num_rows: 5,
-    },
-  ],
-  key_value_metadata: [
-    {
-      key: 'pandas',
-      // value: json
-    },
-    {
-      key: 'ARROW:schema',
-      // value: base64
-    },
-  ],
-}