diff --git a/package.json b/package.json index cea5a62..184bf3b 100644 --- a/package.json +++ b/package.json @@ -27,14 +27,14 @@ "typecheck": "tsc" }, "devDependencies": { - "@types/node": "20.11.3", + "@types/node": "20.11.5", "@typescript-eslint/eslint-plugin": "6.19.0", - "@vitest/coverage-v8": "1.2.0", + "@vitest/coverage-v8": "1.2.1", "eslint": "8.56.0", "eslint-plugin-import": "2.29.1", "eslint-plugin-jsdoc": "48.0.2", "http-server": "14.1.1", "typescript": "5.3.3", - "vitest": "1.2.0" + "vitest": "1.2.1" } } diff --git a/src/hyparquet.d.ts b/src/hyparquet.d.ts index 3dcc0fe..85d52cb 100644 --- a/src/hyparquet.d.ts +++ b/src/hyparquet.d.ts @@ -1,4 +1,4 @@ -export { AsyncBuffer, FileMetaData } from './types' +export { AsyncBuffer, FileMetaData, SchemaTree } from './types' /** * Read parquet data rows from a file-like object. @@ -38,7 +38,7 @@ export async function parquetRead(options: ParquetReadOptions): Promise * * @param {AsyncBuffer} asyncBuffer parquet file contents * @param {number} initialFetchSize initial fetch size in bytes (default 512kb) - * @returns {Promise<FileMetaData>} metadata object + * @returns {Promise<FileMetaData>} parquet metadata object */ export async function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetchSize: number = 1 << 19 /* 512kb */): Promise<FileMetaData> @@ -46,10 +46,18 @@ export async function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetc * Read parquet metadata from a buffer * * @param {ArrayBuffer} arrayBuffer parquet file contents - * @returns {FileMetaData} metadata object + * @returns {FileMetaData} parquet metadata object */ export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData +/** + * Return a tree of schema elements from parquet metadata. + * + * @param {FileMetaData} metadata parquet metadata object + * @returns {SchemaTree} tree of schema elements + */ +export function parquetSchema(metadata: FileMetaData): SchemaTree + /** * Decompress snappy data. 
* Accepts an output buffer to avoid allocating a new buffer for each call. diff --git a/src/hyparquet.js b/src/hyparquet.js index 5f826d2..b912757 100644 --- a/src/hyparquet.js +++ b/src/hyparquet.js @@ -1,5 +1,5 @@ -import { parquetMetadata, parquetMetadataAsync } from './metadata.js' -export { parquetMetadata, parquetMetadataAsync } +import { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata.js' +export { parquetMetadata, parquetMetadataAsync, parquetSchema } import { parquetRead } from './read.js' export { parquetRead } diff --git a/src/metadata.js b/src/metadata.js index 336791b..fc7ea3a 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -1,3 +1,4 @@ +import { schemaTree } from './schema.js' import { deserializeTCompactProtocol } from './thrift.js' /** @@ -17,7 +18,7 @@ import { deserializeTCompactProtocol } from './thrift.js' * @typedef {import("./types.d.ts").FileMetaData} FileMetaData * @param {AsyncBuffer} asyncBuffer parquet file contents * @param {number} initialFetchSize initial fetch size in bytes - * @returns {Promise<FileMetaData>} metadata object + * @returns {Promise<FileMetaData>} parquet metadata object */ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) { // fetch last bytes (footer) of the file @@ -46,7 +47,7 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << * Read parquet metadata from a buffer * * @param {ArrayBuffer} arrayBuffer parquet file contents - * @returns {FileMetaData} metadata object + * @returns {FileMetaData} parquet metadata object */ export function parquetMetadata(arrayBuffer) { // DataView for easier manipulation of the buffer @@ -140,3 +141,14 @@ export function parquetMetadata(arrayBuffer) { metadata_length: metadataLength, } } + +/** + * Return a tree of schema elements from parquet metadata. 
+ * + * @typedef {import("./types.d.ts").SchemaTree} SchemaTree + * @param {FileMetaData} metadata parquet metadata object + * @returns {SchemaTree} tree of schema elements + */ +export function parquetSchema(metadata) { + return schemaTree(metadata.schema, 0) +} diff --git a/src/schema.js b/src/schema.js index 9a45fcd..8be3941 100644 --- a/src/schema.js +++ b/src/schema.js @@ -2,31 +2,31 @@ import { FieldRepetitionType } from './constants.js' /** * @typedef {import('./types.js').SchemaElement} SchemaElement - * @typedef {{ element: SchemaElement, children: SchemaTree[], endIndex: number }} SchemaTree + * @typedef {import('./types.js').SchemaTree} SchemaTree */ /** * Build a tree from the schema elements. * * @param {SchemaElement[]} schema - * @param {number} i index of the root element + * @param {number} rootIndex index of the root element * @returns {SchemaTree} tree of schema elements */ -function schemaTree(schema, i) { - const root = schema[i] +export function schemaTree(schema, rootIndex) { + const root = schema[rootIndex] const children = [] - i++ + let count = 1 // Read the specified number of children if (root.num_children) { while (children.length < root.num_children) { - const child = schemaTree(schema, i) - i = child.endIndex + const child = schemaTree(schema, rootIndex + count) + count += child.count children.push(child) } } - return { endIndex: i, element: root, children } + return { count, element: root, children } } /** diff --git a/src/types.d.ts b/src/types.d.ts index d9c5236..394b14f 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -34,6 +34,12 @@ export interface FileMetaData { metadata_length: number } +export interface SchemaTree { + element: SchemaElement + children: SchemaTree[] + count: number +} + export interface SchemaElement { type?: ParquetType type_length?: number diff --git a/test/metadata.test.js b/test/metadata.test.js index 040c73e..427961d 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -1,6 +1,6 @@ import 
fs from 'fs' import { describe, expect, it } from 'vitest' -import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js' +import { parquetMetadata, parquetMetadataAsync, parquetSchema } from '../src/hyparquet.js' import { toJson } from '../src/toJson.js' /** @@ -77,6 +77,59 @@ describe('parquetMetadataAsync', () => { }) }) +describe('parquetSchema', () => { + it('should parse schema from addrtype-missing-value.parquet', async () => { + const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet') + const metadata = parquetMetadata(arrayBuffer) + const result = parquetSchema(metadata) + expect(toJson(result)).toEqual({ + children: [ + { + children: [], + count: 1, + element: { + converted_type: 0, + name: 'ADDRTYPE', + repetition_type: 1, + type: 6, + }, + }, + ], + count: 2, + element: { + name: 'duckdb_schema', + num_children: 1, + repetition_type: 0, + }, + }) + }) + + it('should parse schema from rowgroups.parquet', async () => { + const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet') + const metadata = parquetMetadata(arrayBuffer) + const result = parquetSchema(metadata) + expect(toJson(result)).toEqual({ + children: [ + { + children: [], + count: 1, + element: { + name: 'numbers', + repetition_type: 1, + type: 2, + }, + }, + ], + count: 2, + element: { + name: 'schema', + num_children: 1, + repetition_type: 0, + }, + }) + }) +}) + // Parquet v1 from DuckDB const addrtypeMetadata = { version: 1,