mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-02-23 21:01:32 +00:00
Export parquetSchema tree
This commit is contained in:
parent
77f0354599
commit
8d7b145439
@ -27,14 +27,14 @@
|
||||
"typecheck": "tsc"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "20.11.3",
|
||||
"@types/node": "20.11.5",
|
||||
"@typescript-eslint/eslint-plugin": "6.19.0",
|
||||
"@vitest/coverage-v8": "1.2.0",
|
||||
"@vitest/coverage-v8": "1.2.1",
|
||||
"eslint": "8.56.0",
|
||||
"eslint-plugin-import": "2.29.1",
|
||||
"eslint-plugin-jsdoc": "48.0.2",
|
||||
"http-server": "14.1.1",
|
||||
"typescript": "5.3.3",
|
||||
"vitest": "1.2.0"
|
||||
"vitest": "1.2.1"
|
||||
}
|
||||
}
|
||||
|
||||
14
src/hyparquet.d.ts
vendored
14
src/hyparquet.d.ts
vendored
@ -1,4 +1,4 @@
|
||||
export { AsyncBuffer, FileMetaData } from './types'
|
||||
export { AsyncBuffer, FileMetaData, SchemaTree } from './types'
|
||||
|
||||
/**
|
||||
* Read parquet data rows from a file-like object.
|
||||
@ -38,7 +38,7 @@ export async function parquetRead(options: ParquetReadOptions): Promise<void>
|
||||
*
|
||||
* @param {AsyncBuffer} asyncBuffer parquet file contents
|
||||
* @param {number} initialFetchSize initial fetch size in bytes (default 512kb)
|
||||
* @returns {Promise<FileMetaData>} metadata object
|
||||
* @returns {Promise<FileMetaData>} parquet metadata object
|
||||
*/
|
||||
export async function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetchSize: number = 1 << 19 /* 512kb */): Promise<FileMetaData>
|
||||
|
||||
@ -46,10 +46,18 @@ export async function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetc
|
||||
* Read parquet metadata from a buffer
|
||||
*
|
||||
* @param {ArrayBuffer} arrayBuffer parquet file contents
|
||||
* @returns {FileMetaData} metadata object
|
||||
* @returns {FileMetaData} parquet metadata object
|
||||
*/
|
||||
export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData
|
||||
|
||||
/**
|
||||
* Return a tree of schema elements from parquet metadata.
|
||||
*
|
||||
* @param {FileMetaData} metadata parquet metadata object
|
||||
* @returns {SchemaTree} tree of schema elements
|
||||
*/
|
||||
// Takes the whole parquet metadata object (not the bare schema array): the implementation reads `metadata.schema`.
export function parquetSchema(metadata: FileMetaData): SchemaTree
|
||||
|
||||
/**
|
||||
* Decompress snappy data.
|
||||
* Accepts an output buffer to avoid allocating a new buffer for each call.
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
import { parquetMetadata, parquetMetadataAsync } from './metadata.js'
|
||||
export { parquetMetadata, parquetMetadataAsync }
|
||||
import { parquetMetadata, parquetMetadataAsync, parquetSchema } from './metadata.js'
|
||||
export { parquetMetadata, parquetMetadataAsync, parquetSchema }
|
||||
|
||||
import { parquetRead } from './read.js'
|
||||
export { parquetRead }
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import { schemaTree } from './schema.js'
|
||||
import { deserializeTCompactProtocol } from './thrift.js'
|
||||
|
||||
/**
|
||||
@ -17,7 +18,7 @@ import { deserializeTCompactProtocol } from './thrift.js'
|
||||
* @typedef {import("./types.d.ts").FileMetaData} FileMetaData
|
||||
* @param {AsyncBuffer} asyncBuffer parquet file contents
|
||||
* @param {number} initialFetchSize initial fetch size in bytes
|
||||
* @returns {Promise<FileMetaData>} metadata object
|
||||
* @returns {Promise<FileMetaData>} parquet metadata object
|
||||
*/
|
||||
export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 << 19 /* 512kb */) {
|
||||
// fetch last bytes (footer) of the file
|
||||
@ -46,7 +47,7 @@ export async function parquetMetadataAsync(asyncBuffer, initialFetchSize = 1 <<
|
||||
* Read parquet metadata from a buffer
|
||||
*
|
||||
* @param {ArrayBuffer} arrayBuffer parquet file contents
|
||||
* @returns {FileMetaData} metadata object
|
||||
* @returns {FileMetaData} parquet metadata object
|
||||
*/
|
||||
export function parquetMetadata(arrayBuffer) {
|
||||
// DataView for easier manipulation of the buffer
|
||||
@ -140,3 +141,14 @@ export function parquetMetadata(arrayBuffer) {
|
||||
metadata_length: metadataLength,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a tree of schema elements from parquet metadata.
|
||||
*
|
||||
* @typedef {import("./types.d.ts").SchemaTree} SchemaTree
|
||||
* @param {FileMetaData} metadata parquet metadata object
|
||||
* @returns {SchemaTree} tree of schema elements
|
||||
*/
|
||||
export function parquetSchema(metadata) {
|
||||
return schemaTree(metadata.schema, 0)
|
||||
}
|
||||
|
||||
@ -2,31 +2,31 @@ import { FieldRepetitionType } from './constants.js'
|
||||
|
||||
/**
|
||||
* @typedef {import('./types.js').SchemaElement} SchemaElement
|
||||
* @typedef {{ element: SchemaElement, children: SchemaTree[], endIndex: number }} SchemaTree
|
||||
* @typedef {import('./types.js').SchemaTree} SchemaTree
|
||||
*/
|
||||
|
||||
/**
|
||||
* Build a tree from the schema elements.
|
||||
*
|
||||
* @param {SchemaElement[]} schema
|
||||
* @param {number} i index of the root element
|
||||
* @param {number} rootIndex index of the root element
|
||||
* @returns {SchemaTree} tree of schema elements
|
||||
*/
|
||||
function schemaTree(schema, i) {
|
||||
const root = schema[i]
|
||||
export function schemaTree(schema, rootIndex) {
|
||||
const root = schema[rootIndex]
|
||||
const children = []
|
||||
i++
|
||||
let count = 1
|
||||
|
||||
// Read the specified number of children
|
||||
if (root.num_children) {
|
||||
while (children.length < root.num_children) {
|
||||
const child = schemaTree(schema, i)
|
||||
i = child.endIndex
|
||||
const child = schemaTree(schema, rootIndex + count)
|
||||
count += child.count
|
||||
children.push(child)
|
||||
}
|
||||
}
|
||||
|
||||
return { endIndex: i, element: root, children }
|
||||
return { count, element: root, children }
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
6
src/types.d.ts
vendored
6
src/types.d.ts
vendored
@ -34,6 +34,12 @@ export interface FileMetaData {
|
||||
metadata_length: number
|
||||
}
|
||||
|
||||
export interface SchemaTree {
|
||||
element: SchemaElement
|
||||
children: SchemaTree[]
|
||||
count: number
|
||||
}
|
||||
|
||||
export interface SchemaElement {
|
||||
type?: ParquetType
|
||||
type_length?: number
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import fs from 'fs'
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { parquetMetadata, parquetMetadataAsync } from '../src/hyparquet.js'
|
||||
import { parquetMetadata, parquetMetadataAsync, parquetSchema } from '../src/hyparquet.js'
|
||||
import { toJson } from '../src/toJson.js'
|
||||
|
||||
/**
|
||||
@ -77,6 +77,59 @@ describe('parquetMetadataAsync', () => {
|
||||
})
|
||||
})
|
||||
|
||||
describe('parquetSchema', () => {
|
||||
it('should parse schema from addrtype-missing-value.parquet', async () => {
|
||||
const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
|
||||
const metadata = parquetMetadata(arrayBuffer)
|
||||
const result = parquetSchema(metadata)
|
||||
expect(toJson(result)).toEqual({
|
||||
children: [
|
||||
{
|
||||
children: [],
|
||||
count: 1,
|
||||
element: {
|
||||
converted_type: 0,
|
||||
name: 'ADDRTYPE',
|
||||
repetition_type: 1,
|
||||
type: 6,
|
||||
},
|
||||
},
|
||||
],
|
||||
count: 2,
|
||||
element: {
|
||||
name: 'duckdb_schema',
|
||||
num_children: 1,
|
||||
repetition_type: 0,
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
it('should parse schema from rowgroups.parquet', async () => {
|
||||
const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
|
||||
const metadata = parquetMetadata(arrayBuffer)
|
||||
const result = parquetSchema(metadata)
|
||||
expect(toJson(result)).toEqual({
|
||||
children: [
|
||||
{
|
||||
children: [],
|
||||
count: 1,
|
||||
element: {
|
||||
name: 'numbers',
|
||||
repetition_type: 1,
|
||||
type: 2,
|
||||
},
|
||||
},
|
||||
],
|
||||
count: 2,
|
||||
element: {
|
||||
name: 'schema',
|
||||
num_children: 1,
|
||||
repetition_type: 0,
|
||||
},
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
// Parquet v1 from DuckDB
|
||||
const addrtypeMetadata = {
|
||||
version: 1,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user