diff --git a/package.json b/package.json index e5f5015..dc273e1 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,7 @@ }, "devDependencies": { "@types/node": "20.11.17", - "@typescript-eslint/eslint-plugin": "6.21.0", + "@typescript-eslint/eslint-plugin": "7.0.1", "@vitest/coverage-v8": "1.2.2", "eslint": "8.56.0", "eslint-plugin-import": "2.29.1", diff --git a/test/files/addrtype-missing-value.metadata.json b/test/files/addrtype-missing-value.metadata.json new file mode 100644 index 0000000..14e7beb --- /dev/null +++ b/test/files/addrtype-missing-value.metadata.json @@ -0,0 +1,47 @@ +{ + "version": 1, + "created_by": "DuckDB", + "metadata_length": 149, + "schema": [ + { + "repetition_type": "REQUIRED", + "name": "duckdb_schema", + "num_children": 1 + }, + { + "type": 6, + "repetition_type": "OPTIONAL", + "name": "ADDRTYPE", + "converted_type": "UTF8" + } + ], + "num_rows": 10, + "row_groups": [ + { + "columns": [ + { + "file_offset": 0, + "meta_data": { + "type": 6, + "encodings": [0, 8], + "path_in_schema": ["ADDRTYPE"], + "codec": "SNAPPY", + "num_values": 10, + "total_uncompressed_size": 78, + "total_compressed_size": 82, + "data_page_offset": 31, + "dictionary_page_offset": 4, + "statistics": { + "max": "Intersection", + "min": "Block", + "null_count": 1, + "distinct_count": 2 + } + } + } + ], + "total_byte_size": 33024, + "num_rows": 10 + } + ] +} diff --git a/test/files/rowgroups.metadata.json b/test/files/rowgroups.metadata.json new file mode 100644 index 0000000..7588211 --- /dev/null +++ b/test/files/rowgroups.metadata.json @@ -0,0 +1,86 @@ +{ + "version": 2, + "created_by": "parquet-cpp-arrow version 14.0.2", + "metadata_length": 1602, + "schema": [ + { + "repetition_type": "REQUIRED", + "name": "schema", + "num_children": 1 + }, + { + "type": 2, + "repetition_type": "OPTIONAL", + "name": "numbers" + } + ], + "num_rows": 15, + "row_groups": [ + { + "columns": [ + { + "file_offset": 150, + "meta_data": { + "codec": "SNAPPY", + "data_page_offset": 
71, + "dictionary_page_offset": 4, + "encoding_stats": [ + { "count": 1, "encoding": 0, "page_type": 2 }, + { "count": 1, "encoding": 8, "page_type": 0 } + ], + "encodings": [0, 3, 8], + "num_values": 10, + "path_in_schema": ["numbers"], + "statistics": { + "max": "\n\u0000\u0000\u0000\u0000\u0000\u0000\u0000", + "min": "\u0001\u0000\u0000\u0000\u0000\u0000\u0000\u0000", + "null_count": 0 + }, + "total_compressed_size": 146, + "total_uncompressed_size": 172, + "type": 2 + } + } + ], + "total_byte_size": 172, + "num_rows": 10 + }, + { + "columns": [ + { + "file_offset": 368, + "meta_data": { + "codec": "SNAPPY", + "data_page_offset": 294, + "dictionary_page_offset": 248, + "encoding_stats": [ + { "count": 1, "encoding": 0, "page_type": 2 }, + { "count": 1, "encoding": 8, "page_type": 0 } + ], + "encodings": [0, 3, 8], + "num_values": 5, + "path_in_schema": ["numbers"], + "statistics": { + "max": "\u000f\u0000\u0000\u0000\u0000\u0000\u0000\u0000", + "min": "\u000b\u0000\u0000\u0000\u0000\u0000\u0000\u0000", + "null_count": 0 + }, + "total_compressed_size": 120, + "total_uncompressed_size": 126, + "type": 2 + } + } + ], + "total_byte_size": 126, + "num_rows": 5 + } + ], + "key_value_metadata": [ + { + "key": "pandas" + }, + { + "key": "ARROW:schema" + } + ] +} diff --git a/test/helpers.js b/test/helpers.js index ccad905..066ea6c 100644 --- a/test/helpers.js +++ b/test/helpers.js @@ -24,3 +24,14 @@ export function fileToAsyncBuffer(filePath) { slice: async (start, end) => (await readFileToArrayBuffer(filePath)).slice(start, end), } } + +/** + * Read a JSON file from disk and parse its contents + * + * @param {string} filePath path of the JSON file to read + * @returns {any} the parsed JSON value + */ +export function fileToJson(filePath) { + const buffer = fs.readFileSync(filePath) + return JSON.parse(buffer.toString()) +} diff --git a/test/metadata.test.js b/test/metadata.test.js index 6d92f50..54c1e2a 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -1,19 +1,20 @@ +import fs from 'fs' import { describe, expect, it } from
'vitest' import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js' import { toJson } from '../src/toJson.js' -import { fileToAsyncBuffer, readFileToArrayBuffer } from './helpers.js' +import { fileToAsyncBuffer, fileToJson, readFileToArrayBuffer } from './helpers.js' describe('parquetMetadata', () => { - it('should parse metadata from addrtype-missing-value.parquet', async () => { - const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet') - const result = parquetMetadata(arrayBuffer) - expect(toJson(result)).toEqual(addrtypeMetadata) - }) - - it('should parse metadata from rowgroups.parquet', async () => { - const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet') - const result = parquetMetadata(arrayBuffer) - expect(toJson(result)).containSubset(rowgroupsMetadata) + it('should parse metadata from all test files', async () => { + const files = fs.readdirSync('test/files') + for (const file of files) { + if (!file.endsWith('.parquet')) continue + const arrayBuffer = await readFileToArrayBuffer(`test/files/${file}`) + const result = parquetMetadata(arrayBuffer) + const base = file.replace('.parquet', '') + const expected = fileToJson(`test/files/${base}.metadata.json`) + expect(toJson(result)).containSubset(expected) + } }) it('should throw an error for a too short file', () => { @@ -45,17 +46,16 @@ describe('parquetMetadata', () => { }) describe('parquetMetadataAsync', () => { - it('should parse metadata asynchronously from addrtype-missing-value.parquet', async () => { - const asyncBuffer = fileToAsyncBuffer('test/files/addrtype-missing-value.parquet') - const result = await parquetMetadataAsync(asyncBuffer) - expect(toJson(result)).toEqual(addrtypeMetadata) - }) - - it('should parse metadata asynchronously from rowgroups.parquet', async () => { - const asyncBuffer = fileToAsyncBuffer('test/files/rowgroups.parquet') - // force two fetches - const result = await 
parquetMetadataAsync(asyncBuffer, 1609) - expect(toJson(result)).containSubset(rowgroupsMetadata) + it('should parse metadata asynchronously from all test files', async () => { + const files = fs.readdirSync('test/files') + for (const file of files) { + if (!file.endsWith('.parquet')) continue + const asyncBuffer = fileToAsyncBuffer(`test/files/${file}`) + const result = await parquetMetadataAsync(asyncBuffer) + const base = file.replace('.parquet', '') + const expected = fileToJson(`test/files/${base}.metadata.json`) + expect(toJson(result)).containSubset(expected) + } }) it('should throw an error for invalid magic number', () => { @@ -70,143 +70,3 @@ describe('parquetMetadataAsync', () => { .toThrow('parquet metadata length 4294967295 exceeds available buffer 0') }) }) - -// Parquet v1 from DuckDB -const addrtypeMetadata = { - version: 1, - created_by: 'DuckDB', - metadata_length: 149, - schema: [ - { - repetition_type: 'REQUIRED', - name: 'duckdb_schema', - num_children: 1, - }, - { - type: 6, - repetition_type: 'OPTIONAL', - name: 'ADDRTYPE', - converted_type: 'UTF8', - }, - ], - num_rows: 10, - row_groups: [ - { - columns: [ - { - file_offset: 0, - meta_data: { - type: 6, - encodings: [0, 8], - path_in_schema: ['ADDRTYPE'], - codec: 'SNAPPY', - num_values: 10, - total_uncompressed_size: 78, - total_compressed_size: 82, - data_page_offset: 31, - dictionary_page_offset: 4, - statistics: { - max: 'Intersection', - min: 'Block', - null_count: 1, - distinct_count: 2, - }, - }, - }, - ], - total_byte_size: 33024, - num_rows: 10, - }, - ], -} - -// Parquet v2 from pandas with 2 row groups -const rowgroupsMetadata = { - version: 2, - created_by: 'parquet-cpp-arrow version 14.0.2', - metadata_length: 1602, - schema: [ - { - repetition_type: 'REQUIRED', - name: 'schema', - num_children: 1, - }, - { - type: 2, - repetition_type: 'OPTIONAL', - name: 'numbers', - }, - ], - num_rows: 15, - row_groups: [ - { - columns: [ - { - file_offset: 150, - file_path: undefined, - 
meta_data: { - codec: 'SNAPPY', - data_page_offset: 71, - dictionary_page_offset: 4, - encoding_stats: [ - { count: 1, encoding: 0, page_type: 2 }, - { count: 1, encoding: 8, page_type: 0 }, - ], - encodings: [0, 3, 8], - num_values: 10, - path_in_schema: ['numbers'], - statistics: { - max: '\n\x00\x00\x00\x00\x00\x00\x00', - min: '\x01\x00\x00\x00\x00\x00\x00\x00', - null_count: 0, - }, - total_compressed_size: 146, - total_uncompressed_size: 172, - type: 2, - }, - }, - ], - total_byte_size: 172, - num_rows: 10, - }, - { - columns: [ - { - file_offset: 368, - meta_data: { - codec: 'SNAPPY', - data_page_offset: 294, - dictionary_page_offset: 248, - encoding_stats: [ - { count: 1, encoding: 0, page_type: 2 }, - { count: 1, encoding: 8, page_type: 0 }, - ], - encodings: [0, 3, 8], - num_values: 5, - path_in_schema: ['numbers'], - statistics: { - max: '\x0F\x00\x00\x00\x00\x00\x00\x00', - min: '\x0B\x00\x00\x00\x00\x00\x00\x00', - null_count: 0, - }, - total_compressed_size: 120, - total_uncompressed_size: 126, - type: 2, - }, - }, - ], - total_byte_size: 126, - num_rows: 5, - }, - ], - key_value_metadata: [ - { - key: 'pandas', - // value: json - }, - { - key: 'ARROW:schema', - // value: base64 - }, - ], -}