Export parquetSchema tree

This commit is contained in:
Kenny Daniel 2024-01-19 18:51:16 -08:00
parent 77f0354599
commit 8d7b145439
No known key found for this signature in database
GPG Key ID: 6A3C5E318BE71391
7 changed files with 98 additions and 19 deletions

@ -27,14 +27,14 @@
"typecheck": "tsc"
},
"devDependencies": {
"@types/node": "20.11.3",
"@types/node": "20.11.5",
"@typescript-eslint/eslint-plugin": "6.19.0",
"@vitest/coverage-v8": "1.2.0",
"@vitest/coverage-v8": "1.2.1",
"eslint": "8.56.0",
"eslint-plugin-import": "2.29.1",
"eslint-plugin-jsdoc": "48.0.2",
"http-server": "14.1.1",
"typescript": "5.3.3",
"vitest": "1.2.0"
"vitest": "1.2.1"
}
}

14
src/hyparquet.d.ts vendored

@ -1,4 +1,4 @@
export { AsyncBuffer, FileMetaData } from './types'
export { AsyncBuffer, FileMetaData, SchemaTree } from './types'
/**
* Read parquet data rows from a file-like object.
@ -38,7 +38,7 @@ export async function parquetRead(options: ParquetReadOptions): Promise<void>
*
* @param {AsyncBuffer} asyncBuffer parquet file contents
* @param {number} initialFetchSize initial fetch size in bytes (default 512kb)
* @returns {Promise<FileMetaData>} metadata object
* @returns {Promise<FileMetaData>} parquet metadata object
*/
export async function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetchSize: number = 1 << 19 /* 512kb */): Promise<FileMetaData>
@ -46,10 +46,18 @@ export async function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetc
* Read parquet metadata from a buffer
*
* @param {ArrayBuffer} arrayBuffer parquet file contents
* @returns {FileMetaData} metadata object
* @returns {FileMetaData} parquet metadata object
*/
export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData
/**
 * Return a tree of schema elements from parquet metadata.
 *
 * The tree is built from the metadata's `schema` list, starting at the
 * root element (index 0).
 *
 * @param {FileMetaData} metadata parquet metadata object
 * @returns {SchemaTree} tree of schema elements
 */
export function parquetSchema(metadata: FileMetaData): SchemaTree
/**
* Decompress snappy data.
* Accepts an output buffer to avoid allocating a new buffer for each call.

@ -1,5 +1,5 @@
import { parquetMetadata, parquetMetadataAsync } from './metadata.js'
export { parquetMetadata, parquetMetadataAsync }
import { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata.js'
export { parquetMetadata, parquetMetadataAsync, parquetSchema }
import { parquetRead } from './read.js'
export { parquetRead }

@ -1,3 +1,4 @@
import { schemaTree } from './schema.js'
import { deserializeTCompactProtocol } from './thrift.js'
/**
@ -17,7 +18,7 @@ import { deserializeTCompactProtocol } from './thrift.js'
* @typedef {import("./types.d.ts").FileMetaData} FileMetaData
* @param {AsyncBuffer} asyncBuffer parquet file contents
* @param {number} initialFetchSize initial fetch size in bytes
* @returns {Promise<FileMetaData>} metadata object
* @returns {Promise<FileMetaData>} parquet metadata object
*/
export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
// fetch last bytes (footer) of the file
@ -46,7 +47,7 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
* Read parquet metadata from a buffer
*
* @param {ArrayBuffer} arrayBuffer parquet file contents
* @returns {FileMetaData} metadata object
* @returns {FileMetaData} parquet metadata object
*/
export function parquetMetadata(arrayBuffer) {
// DataView for easier manipulation of the buffer
@ -140,3 +141,14 @@ export function parquetMetadata(arrayBuffer) {
metadata_length: metadataLength,
}
}
/**
 * Build the nested schema tree for a parquet file.
 *
 * Hands the metadata's `schema` list to schemaTree, using index 0 as the
 * root element.
 *
 * @typedef {import("./types.d.ts").SchemaTree} SchemaTree
 * @param {FileMetaData} metadata parquet metadata object
 * @returns {SchemaTree} tree of schema elements
 */
export function parquetSchema(metadata) {
  const { schema } = metadata
  const rootIndex = 0
  return schemaTree(schema, rootIndex)
}

@ -2,31 +2,31 @@ import { FieldRepetitionType } from './constants.js'
/**
* @typedef {import('./types.js').SchemaElement} SchemaElement
* @typedef {{ element: SchemaElement, children: SchemaTree[], endIndex: number }} SchemaTree
* @typedef {import('./types.js').SchemaTree} SchemaTree
*/
/**
* Build a tree from the schema elements.
*
* @param {SchemaElement[]} schema
* @param {number} i index of the root element
* @param {number} rootIndex index of the root element
* @returns {SchemaTree} tree of schema elements
*/
function schemaTree(schema, i) {
const root = schema[i]
export function schemaTree(schema, rootIndex) {
const root = schema[rootIndex]
// NOTE(review): this listing appears to interleave two variants of the
// function body (i/endIndex and rootIndex/count) — confirm which lines are current.
const children = []
i++
// count = number of schema elements consumed by this subtree so far; 1 for the root itself
let count = 1
// Read the specified number of children
if (root.num_children) {
while (children.length < root.num_children) {
const child = schemaTree(schema, i)
i = child.endIndex
// the next unread element sits `count` positions after the root
const child = schemaTree(schema, rootIndex + count)
// step over the child and everything nested beneath it
count += child.count
children.push(child)
}
}
return { endIndex: i, element: root, children }
return { count, element: root, children }
}
/**

6
src/types.d.ts vendored

@ -34,6 +34,12 @@ export interface FileMetaData {
metadata_length: number
}
/**
 * Node of the nested schema tree built from the schema element list.
 */
export interface SchemaTree {
  /** Schema element stored at this node. */
  element: SchemaElement
  /** Number of schema elements in this subtree, this node included. */
  count: number
  /** Child nodes, in the order they were read from the schema list. */
  children: SchemaTree[]
}
export interface SchemaElement {
type?: ParquetType
type_length?: number

@ -1,6 +1,6 @@
import fs from 'fs'
import { describe, expect, it } from 'vitest'
import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js'
import { parquetMetadata, parquetMetadataAsync, parquetSchema } from '../src/hyparquet.js'
import { toJson } from '../src/toJson.js'
/**
@ -77,6 +77,59 @@ describe('parquetMetadataAsync', () => {
})
})
describe('parquetSchema', () => {
  it('should parse schema from addrtype-missing-value.parquet', async () => {
    // Expected tree: one root element with a single leaf child
    const expectedLeaf = {
      children: [],
      count: 1,
      element: {
        converted_type: 0,
        name: 'ADDRTYPE',
        repetition_type: 1,
        type: 6,
      },
    }
    const expectedRoot = {
      children: [expectedLeaf],
      count: 2,
      element: {
        name: 'duckdb_schema',
        num_children: 1,
        repetition_type: 0,
      },
    }

    const buffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
    const tree = parquetSchema(parquetMetadata(buffer))
    expect(toJson(tree)).toEqual(expectedRoot)
  })

  it('should parse schema from rowgroups.parquet', async () => {
    // Expected tree: one root element with a single leaf child
    const expectedLeaf = {
      children: [],
      count: 1,
      element: {
        name: 'numbers',
        repetition_type: 1,
        type: 2,
      },
    }
    const expectedRoot = {
      children: [expectedLeaf],
      count: 2,
      element: {
        name: 'schema',
        num_children: 1,
        repetition_type: 0,
      },
    }

    const buffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
    const tree = parquetSchema(parquetMetadata(buffer))
    expect(toJson(tree)).toEqual(expectedRoot)
  })
})
// Parquet v1 from DuckDB
const addrtypeMetadata = {
version: 1,