Metadata tests

This commit is contained in:
Kenny Daniel 2024-01-03 10:33:37 -08:00
parent 88ff71d924
commit 6616c2f7af
No known key found for this signature in database
GPG Key ID: 6A3C5E318BE71391
9 changed files with 100 additions and 13 deletions

@ -10,7 +10,7 @@
"@typescript-eslint/no-explicit-any": "warn",
"@typescript-eslint/no-unused-vars": "warn",
"arrow-spacing": "error",
"camelcase": "error",
"camelcase": "off",
"comma-spacing": "error",
"comma-dangle": ["error", "always-multiline"],
"eol-last": "error",
@ -32,7 +32,7 @@
"jsdoc/check-tag-names": "error",
"jsdoc/no-types": "error",
"jsdoc/sort-tags": "error",
"no-constant-condition": "warn",
"no-constant-condition": "off",
"no-multi-spaces": "error",
"no-trailing-spaces": "error",
"no-var": "error",

@ -4,7 +4,7 @@ on:
push:
jobs:
hyllama:
hyparquet:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

@ -9,7 +9,7 @@
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/hyparam/hyllama"
"url": "https://github.com/hyparam/hyparquet"
},
"main": "dist/hyparquet.js",
"files": [
@ -24,12 +24,13 @@
"test": "vitest run"
},
"devDependencies": {
"@typescript-eslint/eslint-plugin": "6.16.0",
"@vitest/coverage-v8": "1.1.0",
"@types/node": "20.10.6",
"@typescript-eslint/eslint-plugin": "6.17.0",
"@vitest/coverage-v8": "1.1.1",
"eslint": "8.56.0",
"eslint-plugin-import": "2.29.1",
"eslint-plugin-jsdoc": "46.9.1",
"eslint-plugin-jsdoc": "48.0.2",
"typescript": "5.3.3",
"vitest": "1.1.0"
"vitest": "1.1.1"
}
}

@ -1,4 +1,4 @@
import { parquetMetadata } from './metadata'
import { parquetMetadata } from './metadata.js'
/**
* Read parquet data rows from a file

@ -1,12 +1,11 @@
import { deserializeTCompactProtocol } from './thrift'
import type { FileMetaData, SchemaElement } from './types'
import { deserializeTCompactProtocol } from './thrift.js'
import type { FileMetaData, SchemaElement } from './types.ts'
/**
* Read parquet header, metadata, and schema information from a file
* @param arrayBuffer parquet file contents
* @returns metadata object
*/
/* eslint-disable camelcase */
export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData {
// DataView for easier manipulation of the buffer
const view = new DataView(arrayBuffer)
@ -110,3 +109,22 @@ export function schemaElement(schema: SchemaElement[], name: string[]): any {
}
return element
}
/**
 * Recursively replace bigints with numbers.
 *
 * Arrays are mapped element by element, and objects are rebuilt as plain
 * objects from their own enumerable string keys. Every other value
 * (including `null` and `undefined`) is returned unchanged.
 *
 * Note: conversion uses `Number(bigint)`, so values outside the safe
 * integer range lose precision.
 *
 * @param obj value to convert
 * @returns a copy of the value with every bigint converted to a number
 */
export function castBigInts(obj: any): any {
  if (typeof obj === 'bigint') {
    return Number(obj)
  }
  if (Array.isArray(obj)) {
    // wrap in a lambda so map's extra (index, array) arguments are not forwarded
    return obj.map(item => castBigInts(item))
  }
  // typeof null is 'object', so it must be excluded before calling Object.keys
  if (obj !== null && typeof obj === 'object') {
    const newObj: Record<string, any> = {}
    for (const key of Object.keys(obj)) {
      newObj[key] = castBigInts(obj[key])
    }
    return newObj
  }
  return obj
}

@ -1,4 +1,4 @@
import { Decoded } from './types'
import type { Decoded } from './types.ts'
// TCompactProtocol types
const CompactType = {

Binary file not shown.

67
test/metadata.test.ts Normal file

@ -0,0 +1,67 @@
import { promises as fs } from 'fs'
import { describe, expect, it } from 'vitest'
import { castBigInts, parquetMetadata } from '../src/metadata'
/**
 * Load a file from disk and return its contents as a standalone ArrayBuffer.
 *
 * The bytes are copied out of the Buffer's underlying storage using the
 * Buffer's own offset and length, so the result holds exactly the file
 * contents and nothing else.
 *
 * @param filePath path of the file to read
 * @returns promise resolving to the file contents
 */
async function readFileToArrayBuffer(filePath: string): Promise<ArrayBuffer> {
  const bytes = await fs.readFile(filePath)
  const start = bytes.byteOffset
  const end = start + bytes.byteLength
  return bytes.buffer.slice(start, end)
}
// Test suite for parquetMetadata: one fixture-based decode check and two
// malformed-input checks.
describe('parquetMetadata', () => {
  it('should correctly decode metadata from addrtype-missing-value.parquet', async () => {
    // Expected metadata for the fixture file, with bigints already cast to numbers
    const expectedMetadata = {
      version: 1,
      schema: [
        { repetition_type: 0, name: 'duckdb_schema', num_children: 1 },
        { type: 6, repetition_type: 1, name: 'ADDRTYPE', converted_type: 0 },
      ],
      num_rows: 10,
      row_groups: [
        {
          columns: [
            {
              file_offset: 0,
              meta_data: {
                type: 6,
                encodings: [0, 8],
                path_in_schema: ['ADDRTYPE'],
                codec: 1,
                num_values: 10,
                total_uncompressed_size: 78,
                total_compressed_size: 82,
                data_page_offset: 31,
                dictionary_page_offset: 4,
                statistics: {
                  max: 'Intersection',
                  min: 'Block',
                  null_count: 1,
                  distinct_count: 2,
                },
              },
            },
          ],
          total_byte_size: 33024,
          num_rows: 10,
        },
      ],
      created_by: 'DuckDB',
    }

    const fileContents = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
    const decoded = parquetMetadata(fileContents)

    // Cast bigints to numbers so the result can be compared with the plain-number fixture
    expect(castBigInts(decoded)).toEqual(expectedMetadata)
  })

  it('should throw an error for a too short file', () => {
    const emptyBuffer = new ArrayBuffer(0)
    const decodeEmpty = () => parquetMetadata(emptyBuffer)
    expect(decodeEmpty).toThrow('parquet file is too short')
  })

  it('should throw an error for invalid magic number', () => {
    // Eight zero bytes passed as the file contents
    const zeroedBuffer = new ArrayBuffer(8)
    const decodeZeroed = () => parquetMetadata(zeroedBuffer)
    expect(decodeZeroed).toThrow('parquet file invalid magic number')
  })
})

@ -2,6 +2,7 @@
"compilerOptions": {
"declaration": true,
"lib": ["esnext", "dom"],
"module": "nodenext",
"outDir": "dist",
"sourceMap": true,
"target": "esnext",