mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-04 02:36:36 +00:00
Add path to schemaTree
This commit is contained in:
parent
892c933a05
commit
12dc5a47f8
@ -27,7 +27,7 @@
|
||||
"typecheck": "tsc"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "20.12.8",
|
||||
"@types/node": "20.12.10",
|
||||
"@typescript-eslint/eslint-plugin": "7.8.0",
|
||||
"@vitest/coverage-v8": "1.6.0",
|
||||
"eslint": "8.57.0",
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
|
||||
import { getColumnOffset, readColumn } from './column.js'
|
||||
import { parquetMetadataAsync } from './metadata.js'
|
||||
import { getColumnName, getSchemaPath, isMapLike } from './schema.js'
|
||||
import { getSchemaPath, isMapLike } from './schema.js'
|
||||
import { concat } from './utils.js'
|
||||
|
||||
/**
|
||||
@ -89,7 +89,7 @@ async function readRowGroup(options, rowGroup, groupStart) {
|
||||
rowGroup.columns.forEach(({ meta_data: columnMetadata }) => {
|
||||
if (!columnMetadata) throw new Error('parquet column metadata is undefined')
|
||||
// skip columns that are not requested
|
||||
if (columns && !columns.includes(getColumnName(columnMetadata.path_in_schema))) return
|
||||
if (columns && !columns.includes(columnMetadata.path_in_schema[0])) return
|
||||
|
||||
const startByte = getColumnOffset(columnMetadata)
|
||||
const endByte = startByte + Number(columnMetadata.total_compressed_size)
|
||||
@ -119,7 +119,7 @@ async function readRowGroup(options, rowGroup, groupStart) {
|
||||
if (!columnMetadata) throw new Error('parquet column metadata is undefined')
|
||||
|
||||
// skip columns that are not requested
|
||||
const columnName = getColumnName(columnMetadata.path_in_schema)
|
||||
const columnName = columnMetadata.path_in_schema[0]
|
||||
if (columns && !columns.includes(columnName)) continue
|
||||
|
||||
const columnStartByte = getColumnOffset(columnMetadata)
|
||||
|
||||
@ -1,30 +1,29 @@
|
||||
/**
|
||||
* @typedef {import('./types.js').SchemaElement} SchemaElement
|
||||
* @typedef {import('./types.js').SchemaTree} SchemaTree
|
||||
*/
|
||||
|
||||
/**
|
||||
* Build a tree from the schema elements.
|
||||
*
|
||||
* @typedef {import('./types.js').SchemaElement} SchemaElement
|
||||
* @typedef {import('./types.js').SchemaTree} SchemaTree
|
||||
* @param {SchemaElement[]} schema
|
||||
* @param {number} rootIndex index of the root element
|
||||
* @param {string[]} path path to the element
|
||||
* @returns {SchemaTree} tree of schema elements
|
||||
*/
|
||||
function schemaTree(schema, rootIndex) {
|
||||
const root = schema[rootIndex]
|
||||
function schemaTree(schema, rootIndex, path) {
|
||||
const element = schema[rootIndex]
|
||||
const children = []
|
||||
let count = 1
|
||||
|
||||
// Read the specified number of children
|
||||
if (root.num_children) {
|
||||
while (children.length < root.num_children) {
|
||||
const child = schemaTree(schema, rootIndex + count)
|
||||
if (element.num_children) {
|
||||
while (children.length < element.num_children) {
|
||||
const childElement = schema[rootIndex + count]
|
||||
const child = schemaTree(schema, rootIndex + count, [...path, childElement.name])
|
||||
count += child.count
|
||||
children.push(child)
|
||||
}
|
||||
}
|
||||
|
||||
return { count, element: root, children }
|
||||
return { count, element, children, path }
|
||||
}
|
||||
|
||||
/**
|
||||
@ -35,7 +34,7 @@ function schemaTree(schema, rootIndex) {
|
||||
* @returns {SchemaTree[]} list of schema elements
|
||||
*/
|
||||
export function getSchemaPath(schema, name) {
|
||||
let tree = schemaTree(schema, 0)
|
||||
let tree = schemaTree(schema, 0, [])
|
||||
const path = [tree]
|
||||
for (const part of name) {
|
||||
const child = tree.children.find(child => child.element.name === part)
|
||||
@ -93,19 +92,6 @@ export function getMaxDefinitionLevel(schemaPath) {
|
||||
return maxLevel
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the column name as foo.bar and handle list and map like columns.
|
||||
*
|
||||
* @param {string[]} path
|
||||
* @returns {string} column name
|
||||
*/
|
||||
export function getColumnName(path) {
|
||||
return path.join('.')
|
||||
.replace(/(\.list\.element)+/g, '')
|
||||
.replace(/\.key_value\.key/g, '')
|
||||
.replace(/\.key_value\.value/g, '')
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a column is list-like.
|
||||
*
|
||||
|
||||
3
src/types.d.ts
vendored
3
src/types.d.ts
vendored
@ -28,9 +28,10 @@ export interface FileMetaData {
|
||||
}
|
||||
|
||||
export interface SchemaTree {
|
||||
element: SchemaElement
|
||||
children: SchemaTree[]
|
||||
count: number
|
||||
element: SchemaElement
|
||||
path: string[]
|
||||
}
|
||||
|
||||
export interface SchemaElement {
|
||||
|
||||
@ -32,6 +32,7 @@ describe('Parquet schema utils', () => {
|
||||
children: [],
|
||||
count: 1,
|
||||
element: { name: 'child1', repetition_type: 'OPTIONAL' },
|
||||
path: ['child1'],
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
@ -30,6 +30,7 @@ const addrtypeSchema = {
|
||||
repetition_type: 'OPTIONAL',
|
||||
type: 'BYTE_ARRAY',
|
||||
},
|
||||
path: ['ADDRTYPE'],
|
||||
},
|
||||
],
|
||||
count: 2,
|
||||
@ -38,6 +39,7 @@ const addrtypeSchema = {
|
||||
num_children: 1,
|
||||
repetition_type: 'REQUIRED',
|
||||
},
|
||||
path: [],
|
||||
}
|
||||
|
||||
// Parquet v2 from pandas with 2 row groups
|
||||
@ -51,6 +53,7 @@ const rowgroupsSchema = {
|
||||
repetition_type: 'OPTIONAL',
|
||||
type: 'INT64',
|
||||
},
|
||||
path: ['numbers'],
|
||||
},
|
||||
],
|
||||
count: 2,
|
||||
@ -59,4 +62,5 @@ const rowgroupsSchema = {
|
||||
num_children: 1,
|
||||
repetition_type: 'REQUIRED',
|
||||
},
|
||||
path: [],
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user