diff --git a/package.json b/package.json index 247af95..46ca798 100644 --- a/package.json +++ b/package.json @@ -27,7 +27,7 @@ "typecheck": "tsc" }, "devDependencies": { - "@types/node": "20.12.8", + "@types/node": "20.12.10", "@typescript-eslint/eslint-plugin": "7.8.0", "@vitest/coverage-v8": "1.6.0", "eslint": "8.57.0", diff --git a/src/read.js b/src/read.js index d4b3b99..df3ae24 100644 --- a/src/read.js +++ b/src/read.js @@ -1,7 +1,7 @@ import { getColumnOffset, readColumn } from './column.js' import { parquetMetadataAsync } from './metadata.js' -import { getColumnName, getSchemaPath, isMapLike } from './schema.js' +import { getSchemaPath, isMapLike } from './schema.js' import { concat } from './utils.js' /** @@ -89,7 +89,7 @@ async function readRowGroup(options, rowGroup, groupStart) { rowGroup.columns.forEach(({ meta_data: columnMetadata }) => { if (!columnMetadata) throw new Error('parquet column metadata is undefined') // skip columns that are not requested - if (columns && !columns.includes(getColumnName(columnMetadata.path_in_schema))) return + if (columns && !columns.includes(columnMetadata.path_in_schema[0])) return const startByte = getColumnOffset(columnMetadata) const endByte = startByte + Number(columnMetadata.total_compressed_size) @@ -119,7 +119,7 @@ async function readRowGroup(options, rowGroup, groupStart) { if (!columnMetadata) throw new Error('parquet column metadata is undefined') // skip columns that are not requested - const columnName = getColumnName(columnMetadata.path_in_schema) + const columnName = columnMetadata.path_in_schema[0] if (columns && !columns.includes(columnName)) continue const columnStartByte = getColumnOffset(columnMetadata) diff --git a/src/schema.js b/src/schema.js index ba62673..6b85cde 100644 --- a/src/schema.js +++ b/src/schema.js @@ -1,30 +1,29 @@ -/** - * @typedef {import('./types.js').SchemaElement} SchemaElement - * @typedef {import('./types.js').SchemaTree} SchemaTree - */ - /** * Build a tree from the schema elements. * + * @typedef {import('./types.js').SchemaElement} SchemaElement + * @typedef {import('./types.js').SchemaTree} SchemaTree * @param {SchemaElement[]} schema * @param {number} rootIndex index of the root element + * @param {string[]} path path to the element * @returns {SchemaTree} tree of schema elements */ -function schemaTree(schema, rootIndex) { - const root = schema[rootIndex] +function schemaTree(schema, rootIndex, path) { + const element = schema[rootIndex] const children = [] let count = 1 // Read the specified number of children - if (root.num_children) { - while (children.length < root.num_children) { - const child = schemaTree(schema, rootIndex + count) + if (element.num_children) { + while (children.length < element.num_children) { + const childElement = schema[rootIndex + count] + const child = schemaTree(schema, rootIndex + count, [...path, childElement.name]) count += child.count children.push(child) } } - return { count, element: root, children } + return { count, element, children, path } } /** @@ -35,7 +34,7 @@ function schemaTree(schema, rootIndex) { * @returns {SchemaTree[]} list of schema elements */ export function getSchemaPath(schema, name) { - let tree = schemaTree(schema, 0) + let tree = schemaTree(schema, 0, []) const path = [tree] for (const part of name) { const child = tree.children.find(child => child.element.name === part) @@ -93,19 +92,6 @@ export function getMaxDefinitionLevel(schemaPath) { return maxLevel } -/** - * Get the column name as foo.bar and handle list and map like columns. - * - * @param {string[]} path - * @returns {string} column name - */ -export function getColumnName(path) { - return path.join('.') - .replace(/(\.list\.element)+/g, '') - .replace(/\.key_value\.key/g, '') - .replace(/\.key_value\.value/g, '') -} - /** * Check if a column is list-like. * diff --git a/src/types.d.ts b/src/types.d.ts index 6d2793f..579be2d 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -28,9 +28,10 @@ export interface FileMetaData { } export interface SchemaTree { - element: SchemaElement children: SchemaTree[] count: number + element: SchemaElement + path: string[] } export interface SchemaElement { diff --git a/test/schema.test.js b/test/schema.test.js index 55fab3d..38534ef 100644 --- a/test/schema.test.js +++ b/test/schema.test.js @@ -32,6 +32,7 @@ describe('Parquet schema utils', () => { children: [], count: 1, element: { name: 'child1', repetition_type: 'OPTIONAL' }, + path: ['child1'], }) }) diff --git a/test/schemaTree.test.js b/test/schemaTree.test.js index 9c1f8d9..487b90e 100644 --- a/test/schemaTree.test.js +++ b/test/schemaTree.test.js @@ -30,6 +30,7 @@ const addrtypeSchema = { repetition_type: 'OPTIONAL', type: 'BYTE_ARRAY', }, + path: ['ADDRTYPE'], }, ], count: 2, @@ -38,6 +39,7 @@ const addrtypeSchema = { num_children: 1, repetition_type: 'REQUIRED', }, + path: [], } // Parquet v2 from pandas with 2 row groups @@ -51,6 +53,7 @@ const rowgroupsSchema = { repetition_type: 'OPTIONAL', type: 'INT64', }, + path: ['numbers'], }, ], count: 2, @@ -59,4 +62,5 @@ const rowgroupsSchema = { num_children: 1, repetition_type: 'REQUIRED', }, + path: [], }