Add path to schemaTree

This commit is contained in:
Kenny Daniel 2024-05-06 13:18:27 -07:00
parent 892c933a05
commit 12dc5a47f8
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
6 changed files with 22 additions and 30 deletions

@ -27,7 +27,7 @@
"typecheck": "tsc"
},
"devDependencies": {
"@types/node": "20.12.8",
"@types/node": "20.12.10",
"@typescript-eslint/eslint-plugin": "7.8.0",
"@vitest/coverage-v8": "1.6.0",
"eslint": "8.57.0",

@ -1,7 +1,7 @@
import { getColumnOffset, readColumn } from './column.js'
import { parquetMetadataAsync } from './metadata.js'
import { getColumnName, getSchemaPath, isMapLike } from './schema.js'
import { getSchemaPath, isMapLike } from './schema.js'
import { concat } from './utils.js'
/**
@ -89,7 +89,7 @@ async function readRowGroup(options, rowGroup, groupStart) {
rowGroup.columns.forEach(({ meta_data: columnMetadata }) => {
if (!columnMetadata) throw new Error('parquet column metadata is undefined')
// skip columns that are not requested
if (columns && !columns.includes(getColumnName(columnMetadata.path_in_schema))) return
if (columns && !columns.includes(columnMetadata.path_in_schema[0])) return
const startByte = getColumnOffset(columnMetadata)
const endByte = startByte + Number(columnMetadata.total_compressed_size)
@ -119,7 +119,7 @@ async function readRowGroup(options, rowGroup, groupStart) {
if (!columnMetadata) throw new Error('parquet column metadata is undefined')
// skip columns that are not requested
const columnName = getColumnName(columnMetadata.path_in_schema)
const columnName = columnMetadata.path_in_schema[0]
if (columns && !columns.includes(columnName)) continue
const columnStartByte = getColumnOffset(columnMetadata)

@ -1,30 +1,29 @@
/**
* @typedef {import('./types.js').SchemaElement} SchemaElement
* @typedef {import('./types.js').SchemaTree} SchemaTree
*/
/**
* Build a tree from the schema elements.
*
* @typedef {import('./types.js').SchemaElement} SchemaElement
* @typedef {import('./types.js').SchemaTree} SchemaTree
* @param {SchemaElement[]} schema
* @param {number} rootIndex index of the root element
* @param {string[]} path path to the element
* @returns {SchemaTree} tree of schema elements
*/
function schemaTree(schema, rootIndex) {
const root = schema[rootIndex]
function schemaTree(schema, rootIndex, path) {
const element = schema[rootIndex]
const children = []
let count = 1
// Read the specified number of children
if (root.num_children) {
while (children.length < root.num_children) {
const child = schemaTree(schema, rootIndex + count)
if (element.num_children) {
while (children.length < element.num_children) {
const childElement = schema[rootIndex + count]
const child = schemaTree(schema, rootIndex + count, [...path, childElement.name])
count += child.count
children.push(child)
}
}
return { count, element: root, children }
return { count, element, children, path }
}
/**
@ -35,7 +34,7 @@ function schemaTree(schema, rootIndex) {
* @returns {SchemaTree[]} list of schema elements
*/
export function getSchemaPath(schema, name) {
let tree = schemaTree(schema, 0)
let tree = schemaTree(schema, 0, [])
const path = [tree]
for (const part of name) {
const child = tree.children.find(child => child.element.name === part)
@ -93,19 +92,6 @@ export function getMaxDefinitionLevel(schemaPath) {
return maxLevel
}
/**
 * Get the column name as foo.bar and handle list and map like columns.
 *
 * The path segments are joined with '.', then the structural wrapper
 * segments are removed from the joined string: every run of
 * `.list.element`, and each `.key_value.key` / `.key_value.value` pair.
 * A wrapper is only removed when it ends on a segment boundary (followed by
 * '.' or the end of the string), so a field whose name merely starts with
 * `element`, `key` or `value` (e.g. `key_value.keyword`) is left intact.
 *
 * @param {string[]} path
 * @returns {string} column name
 */
export function getColumnName(path) {
  // (?=\.|$) anchors each wrapper to the end of a path segment; without it
  // 's.key_value.keyword' would be truncated to 'sword'.
  return path.join('.')
    .replace(/(\.list\.element)+(?=\.|$)/g, '')
    .replace(/\.key_value\.key(?=\.|$)/g, '')
    .replace(/\.key_value\.value(?=\.|$)/g, '')
}
/**
* Check if a column is list-like.
*

3
src/types.d.ts vendored

@ -28,9 +28,10 @@ export interface FileMetaData {
}
export interface SchemaTree {
element: SchemaElement
children: SchemaTree[]
count: number
element: SchemaElement
path: string[]
}
export interface SchemaElement {

@ -32,6 +32,7 @@ describe('Parquet schema utils', () => {
children: [],
count: 1,
element: { name: 'child1', repetition_type: 'OPTIONAL' },
path: ['child1'],
})
})

@ -30,6 +30,7 @@ const addrtypeSchema = {
repetition_type: 'OPTIONAL',
type: 'BYTE_ARRAY',
},
path: ['ADDRTYPE'],
},
],
count: 2,
@ -38,6 +39,7 @@ const addrtypeSchema = {
num_children: 1,
repetition_type: 'REQUIRED',
},
path: [],
}
// Parquet v2 from pandas with 2 row groups
@ -51,6 +53,7 @@ const rowgroupsSchema = {
repetition_type: 'OPTIONAL',
type: 'INT64',
},
path: ['numbers'],
},
],
count: 2,
@ -59,4 +62,5 @@ const rowgroupsSchema = {
num_children: 1,
repetition_type: 'REQUIRED',
},
path: [],
}