hyparquet/test/metadata.test.js

import { promises as fs } from 'fs'
import { describe, expect, it } from 'vitest'
import { parquetMetadata } from '../src/metadata.js'
import { toJson } from '../src/toJson.js'

/**
 * Helper function to read .parquet file into ArrayBuffer
 *
 * @param {string} filePath
 * @returns {Promise<ArrayBuffer>}
 */
async function readFileToArrayBuffer(filePath) {
  const buffer = await fs.readFile(filePath)
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)
}

describe('parquetMetadata', () => {
  it('should correctly decode metadata from addrtype-missing-value.parquet', async () => {
    const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
    const result = parquetMetadata(arrayBuffer)

    // Parquet v1 from DuckDB
    const expectedMetadata = {
      version: 1,
      created_by: 'DuckDB',
      metadata_length: 149,
      schema: [
        { repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
        { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
      ],
      num_rows: 10,
      row_groups: [
        {
          columns: [
            {
              file_offset: 0,
              meta_data: {
                type: 6,
                encodings: [0, 8],
                path_in_schema: ['ADDRTYPE'],
                codec: 1,
                num_values: 10,
                total_uncompressed_size: 78,
                total_compressed_size: 82,
                data_page_offset: 31,
                dictionary_page_offset: 4,
                statistics: {
                  max: 'Intersection',
                  min: 'Block',
                  null_count: 1,
                  distinct_count: 2,
                },
              },
            },
          ],
          total_byte_size: 33024,
          num_rows: 10,
        },
      ],
    }

    const casted = toJson(result)
    expect(casted).toEqual(expectedMetadata)
  })

  it('should correctly decode metadata from rowgroups.parquet', async () => {
    const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
    const result = parquetMetadata(arrayBuffer)

    // Parquet v2 from pandas with 2 row groups
    const expectedMetadata = {
      version: 2,
      created_by: 'parquet-cpp-arrow version 14.0.2',
      metadata_length: 1602,
      schema: [
        {
          repetition_type: 0,
          name: 'schema',
          num_children: 1,
        },
        {
          type: 2,
          repetition_type: 1,
          name: 'numbers',
        },
      ],
      num_rows: 15,
      row_groups: [
        {
          columns: [
            {
              file_offset: 150,
              file_path: undefined,
              meta_data: {
                codec: 1,
                data_page_offset: 71,
                dictionary_page_offset: 4,
                encoding_stats: [
                  { count: 1, encoding: 0, page_type: 2 },
                  { count: 1, encoding: 8, page_type: 0 },
                ],
                encodings: [0, 3, 8],
                num_values: 10,
                path_in_schema: ['numbers'],
                statistics: {
                  max: '\n\x00\x00\x00\x00\x00\x00\x00',
                  min: '\x01\x00\x00\x00\x00\x00\x00\x00',
                  null_count: 0,
                },
                total_compressed_size: 146,
                total_uncompressed_size: 172,
                type: 2,
              },
            },
          ],
          total_byte_size: 172,
          num_rows: 10,
        },
        {
          columns: [
            {
              file_offset: 368,
              meta_data: {
                codec: 1,
                data_page_offset: 294,
                dictionary_page_offset: 248,
                encoding_stats: [
                  { count: 1, encoding: 0, page_type: 2 },
                  { count: 1, encoding: 8, page_type: 0 },
                ],
                encodings: [0, 3, 8],
                num_values: 5,
                path_in_schema: ['numbers'],
                statistics: {
                  max: '\x0F\x00\x00\x00\x00\x00\x00\x00',
                  min: '\x0B\x00\x00\x00\x00\x00\x00\x00',
                  null_count: 0,
                },
                total_compressed_size: 120,
                total_uncompressed_size: 126,
                type: 2,
              },
            },
          ],
          total_byte_size: 126,
          num_rows: 5,
        },
      ],
      key_value_metadata: [
        {
          key: 'pandas',
          // value: json
        },
        {
          key: 'ARROW:schema',
          // value: base64
        },
      ],
    }

    const casted = toJson(result)
    expect(casted).containSubset(expectedMetadata)
  })

  it('should throw an error for a too short file', () => {
    const arrayBuffer = new ArrayBuffer(0)
    expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')
  })

  it('should throw an error for invalid magic number', () => {
    const arrayBuffer = new ArrayBuffer(8)
    expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number')
  })
})
Metadata tests 2024-01-03 18:33:37 +00:00			`import { promises as fs } from 'fs'`
			`import { describe, expect, it } from 'vitest'`
Export toJson 2024-01-05 09:39:59 +00:00			`import { parquetMetadata } from '../src/metadata.js'`
			`import { toJson } from '../src/toJson.js'`
Metadata tests 2024-01-03 18:33:37 +00:00
JS tests 2024-01-03 21:07:08 +00:00			`/**`
			`* Helper function to read .parquet file into ArrayBuffer`
			`*`
			`* @param {string} filePath`
			`* @returns {Promise<ArrayBuffer>}`
			`*/`
			`async function readFileToArrayBuffer(filePath) {`
Metadata tests 2024-01-03 18:33:37 +00:00			`const buffer = await fs.readFile(filePath)`
			`return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength)`
			`}`

			`describe('parquetMetadata', () => {`
			`it('should correctly decode metadata from addrtype-missing-value.parquet', async () => {`
			`const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')`
			`const result = parquetMetadata(arrayBuffer)`

Metadata test rowgroups.parquet 2024-01-11 19:06:37 +00:00			`// Parquet v1 from DuckDB`
Metadata tests 2024-01-03 18:33:37 +00:00			`const expectedMetadata = {`
			`version: 1,`
Include metadata length in metadata 2024-01-12 22:35:20 +00:00			`created_by: 'DuckDB',`
			`metadata_length: 149,`
Metadata tests 2024-01-03 18:33:37 +00:00			`schema: [`
			`{ repetition_type: 0, name: 'duckdb_schema', num_children: 1 },`
			`{ type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },`
			`],`
			`num_rows: 10,`
			`row_groups: [`
			`{`
			`columns: [`
			`{`
			`file_offset: 0,`
			`meta_data: {`
			`type: 6,`
			`encodings: [0, 8],`
			`path_in_schema: ['ADDRTYPE'],`
			`codec: 1,`
			`num_values: 10,`
			`total_uncompressed_size: 78,`
			`total_compressed_size: 82,`
			`data_page_offset: 31,`
			`dictionary_page_offset: 4,`
			`statistics: {`
			`max: 'Intersection',`
			`min: 'Block',`
			`null_count: 1,`
			`distinct_count: 2,`
			`},`
			`},`
			`},`
			`],`
			`total_byte_size: 33024,`
			`num_rows: 10,`
			`},`
			`],`
			`}`

Add parquest metadata demo html 2024-01-04 17:27:47 +00:00			`const casted = toJson(result)`
Metadata tests 2024-01-03 18:33:37 +00:00			`expect(casted).toEqual(expectedMetadata)`
			`})`

Metadata test rowgroups.parquet 2024-01-11 19:06:37 +00:00			`it('should correctly decode metadata from rowgroups.parquet', async () => {`
			`const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')`
			`const result = parquetMetadata(arrayBuffer)`

			`// Parquet v2 from pandas with 2 row groups`
			`const expectedMetadata = {`
			`version: 2,`
Include metadata length in metadata 2024-01-12 22:35:20 +00:00			`created_by: 'parquet-cpp-arrow version 14.0.2',`
			`metadata_length: 1602,`
Metadata test rowgroups.parquet 2024-01-11 19:06:37 +00:00			`schema: [`
			`{`
			`repetition_type: 0,`
			`name: 'schema',`
			`num_children: 1,`
			`},`
			`{`
			`type: 2,`
			`repetition_type: 1,`
			`name: 'numbers',`
			`},`
			`],`
			`num_rows: 15,`
			`row_groups: [`
			`{`
			`columns: [`
			`{`
			`file_offset: 150,`
			`file_path: undefined,`
			`meta_data: {`
			`codec: 1,`
			`data_page_offset: 71,`
			`dictionary_page_offset: 4,`
			`encoding_stats: [`
			`{ count: 1, encoding: 0, page_type: 2 },`
			`{ count: 1, encoding: 8, page_type: 0 },`
			`],`
			`encodings: [0, 3, 8],`
			`num_values: 10,`
			`path_in_schema: ['numbers'],`
			`statistics: {`
			`max: '\n\x00\x00\x00\x00\x00\x00\x00',`
			`min: '\x01\x00\x00\x00\x00\x00\x00\x00',`
			`null_count: 0,`
			`},`
			`total_compressed_size: 146,`
			`total_uncompressed_size: 172,`
			`type: 2,`
			`},`
			`},`
			`],`
			`total_byte_size: 172,`
			`num_rows: 10,`
			`},`
			`{`
			`columns: [`
			`{`
			`file_offset: 368,`
			`meta_data: {`
			`codec: 1,`
			`data_page_offset: 294,`
			`dictionary_page_offset: 248,`
			`encoding_stats: [`
			`{ count: 1, encoding: 0, page_type: 2 },`
			`{ count: 1, encoding: 8, page_type: 0 },`
			`],`
			`encodings: [0, 3, 8],`
			`num_values: 5,`
			`path_in_schema: ['numbers'],`
			`statistics: {`
			`max: '\x0F\x00\x00\x00\x00\x00\x00\x00',`
			`min: '\x0B\x00\x00\x00\x00\x00\x00\x00',`
			`null_count: 0,`
			`},`
			`total_compressed_size: 120,`
			`total_uncompressed_size: 126,`
			`type: 2,`
			`},`
			`},`
			`],`
			`total_byte_size: 126,`
			`num_rows: 5,`
			`},`
			`],`
			`key_value_metadata: [`
			`{`
			`key: 'pandas',`
			`// value: json`
			`},`
			`{`
			`key: 'ARROW:schema',`
			`// value: base64`
			`},`
			`],`
			`}`

			`const casted = toJson(result)`
			`expect(casted).containSubset(expectedMetadata)`
			`})`

Metadata tests 2024-01-03 18:33:37 +00:00			`it('should throw an error for a too short file', () => {`
			`const arrayBuffer = new ArrayBuffer(0)`
			`expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file is too short')`
			`})`

			`it('should throw an error for invalid magic number', () => {`
			`const arrayBuffer = new ArrayBuffer(8)`
			`expect(() => parquetMetadata(arrayBuffer)).toThrow('parquet file invalid magic number')`
			`})`
			`})`