From c6ad30b59a7c70186452d3aee5d0e9c09f9832ce Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Tue, 12 Mar 2024 19:58:54 -0700 Subject: [PATCH] schemaElement returns trees --- .gitignore | 1 - package.json | 8 ++++---- src/column.js | 4 ++-- src/datapage.js | 4 ++-- src/datapageV2.js | 4 ++-- src/metadata.js | 4 ++-- src/schema.js | 10 +++++----- test/schema.test.js | 6 +++++- 8 files changed, 22 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index dc9f5a2..e1f5d4a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ node_modules package-lock.json coverage -dist *.tgz example.parquet .vscode diff --git a/package.json b/package.json index 52596e6..485b3c8 100644 --- a/package.json +++ b/package.json @@ -27,15 +27,15 @@ "typecheck": "tsc" }, "devDependencies": { - "@types/node": "20.11.21", - "@typescript-eslint/eslint-plugin": "7.1.0", + "@types/node": "20.11.26", + "@typescript-eslint/eslint-plugin": "7.2.0", "@vitest/coverage-v8": "1.3.1", "eslint": "8.57.0", "eslint-plugin-import": "2.29.1", - "eslint-plugin-jsdoc": "48.2.0", + "eslint-plugin-jsdoc": "48.2.1", "http-server": "14.1.1", "hysnappy": "0.3.0", - "typescript": "5.3.3", + "typescript": "5.4.2", "vitest": "1.3.1" } } diff --git a/src/column.js b/src/column.js index 143dc29..a914408 100644 --- a/src/column.js +++ b/src/column.js @@ -79,10 +79,10 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, } else { if (dictionaryEncoding && dictionary) { dereferenceDictionary(dictionary, dataPage) - values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema)) + values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element) } else if (Array.isArray(dataPage)) { // convert primitive types to rich types - values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema)) + values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element) } else { values = dataPage // TODO: data page shouldn't be a fixed byte array? } diff --git a/src/datapage.js b/src/datapage.js index 84433d8..a3eb672 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -61,8 +61,8 @@ export function readDataPage(bytes, daph, schema, columnMetadata) { // read values based on encoding const nValues = daph.num_values - numNulls if (daph.encoding === 'PLAIN') { - const se = schemaElement(schema, columnMetadata.path_in_schema) - const utf8 = se.converted_type === 'UTF8' + const { element } = schemaElement(schema, columnMetadata.path_in_schema) + const utf8 = element.converted_type === 'UTF8' const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8) values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value) offset += plainObj.byteLength diff --git a/src/datapageV2.js b/src/datapageV2.js index 4f1d51e..d8c6181 100644 --- a/src/datapageV2.js +++ b/src/datapageV2.js @@ -47,8 +47,8 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp // read values based on encoding const nValues = daph2.num_values - daph2.num_nulls if (daph2.encoding === 'PLAIN') { - const se = schemaElement(schema, columnMetadata.path_in_schema) - const utf8 = se.converted_type === 'UTF8' + const { element } = schemaElement(schema, columnMetadata.path_in_schema) + const utf8 = element.converted_type === 'UTF8' let page = compressedBytes.slice(offset) if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') { page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors) diff --git a/src/metadata.js b/src/metadata.js index 317ef41..22ed256 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -1,5 +1,5 @@ import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js' -import { schemaTree } from './schema.js' +import { schemaElement } from './schema.js' import { deserializeTCompactProtocol } from './thrift.js' /** @@ -170,7 +170,7 @@ export function parquetMetadata(arrayBuffer) { * @returns {SchemaTree} tree of schema elements */ export function parquetSchema(metadata) { - return schemaTree(metadata.schema, 0) + return schemaElement(metadata.schema, []) } /** diff --git a/src/schema.js b/src/schema.js index 347ea14..256d925 100644 --- a/src/schema.js +++ b/src/schema.js @@ -10,7 +10,7 @@ * @param {number} rootIndex index of the root element * @returns {SchemaTree} tree of schema elements */ -export function schemaTree(schema, rootIndex) { +function schemaTree(schema, rootIndex) { const root = schema[rootIndex] const children = [] let count = 1 @@ -32,7 +32,7 @@ export function schemaTree(schema, rootIndex) { * * @param {SchemaElement[]} schema * @param {string[]} name path to the element - * @returns {SchemaElement} schema element + * @returns {SchemaTree} schema element */ export function schemaElement(schema, name) { let tree = schemaTree(schema, 0) @@ -42,7 +42,7 @@ export function schemaElement(schema, name) { if (!child) throw new Error(`parquet schema element not found: ${name}`) tree = child } - return tree.element + return tree } /** @@ -77,7 +77,7 @@ export function isRequired(schema, name) { export function getMaxRepetitionLevel(schema, parts) { let maxLevel = 0 parts.forEach((part, i) => { - const element = schemaElement(schema, parts.slice(0, i + 1)) + const { element } = schemaElement(schema, parts.slice(0, i + 1)) if (element.repetition_type === 'REPEATED') { maxLevel += 1 } @@ -95,7 +95,7 @@ export function getMaxRepetitionLevel(schema, parts) { export function getMaxDefinitionLevel(schema, parts) { let maxLevel = 0 parts.forEach((part, i) => { - const element = schemaElement(schema, parts.slice(0, i + 1)) + const { element } = schemaElement(schema, parts.slice(0, i + 1)) if (element.repetition_type !== 'REQUIRED') { maxLevel += 1 } diff --git a/test/schema.test.js b/test/schema.test.js index b2effc9..39e7b9f 100644 --- a/test/schema.test.js +++ b/test/schema.test.js @@ -20,7 +20,11 @@ describe('Parquet schema utils', () => { describe('schemaElement', () => { it('should return the correct schema element', () => { - expect(schemaElement(schema, ['child1'])).toEqual(schema[1]) + expect(schemaElement(schema, ['child1'])).toEqual({ + children: [], + count: 1, + element: { name: 'child1', repetition_type: 'OPTIONAL' }, + }) }) it('should throw an error if element not found', () => {