schemaElement returns trees

This commit is contained in:
Kenny Daniel 2024-03-12 19:58:54 -07:00
parent 8a98407734
commit c6ad30b59a
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
8 changed files with 22 additions and 19 deletions

1
.gitignore vendored

@ -1,7 +1,6 @@
node_modules
package-lock.json
coverage
dist
*.tgz
example.parquet
.vscode

@ -27,15 +27,15 @@
"typecheck": "tsc"
},
"devDependencies": {
"@types/node": "20.11.21",
"@typescript-eslint/eslint-plugin": "7.1.0",
"@types/node": "20.11.26",
"@typescript-eslint/eslint-plugin": "7.2.0",
"@vitest/coverage-v8": "1.3.1",
"eslint": "8.57.0",
"eslint-plugin-import": "2.29.1",
"eslint-plugin-jsdoc": "48.2.0",
"eslint-plugin-jsdoc": "48.2.1",
"http-server": "14.1.1",
"hysnappy": "0.3.0",
"typescript": "5.3.3",
"typescript": "5.4.2",
"vitest": "1.3.1"
}
}

@ -79,10 +79,10 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
} else {
if (dictionaryEncoding && dictionary) {
dereferenceDictionary(dictionary, dataPage)
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element)
} else if (Array.isArray(dataPage)) {
// convert primitive types to rich types
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema))
values = convert(dataPage, schemaElement(schema, columnMetadata.path_in_schema).element)
} else {
values = dataPage // TODO: data page shouldn't be a fixed byte array?
}

@ -61,8 +61,8 @@ export function readDataPage(bytes, daph, schema, columnMetadata) {
// read values based on encoding
const nValues = daph.num_values - numNulls
if (daph.encoding === 'PLAIN') {
const se = schemaElement(schema, columnMetadata.path_in_schema)
const utf8 = se.converted_type === 'UTF8'
const { element } = schemaElement(schema, columnMetadata.path_in_schema)
const utf8 = element.converted_type === 'UTF8'
const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8)
values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value)
offset += plainObj.byteLength

@ -47,8 +47,8 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp
// read values based on encoding
const nValues = daph2.num_values - daph2.num_nulls
if (daph2.encoding === 'PLAIN') {
const se = schemaElement(schema, columnMetadata.path_in_schema)
const utf8 = se.converted_type === 'UTF8'
const { element } = schemaElement(schema, columnMetadata.path_in_schema)
const utf8 = element.converted_type === 'UTF8'
let page = compressedBytes.slice(offset)
if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') {
page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors)

@ -1,5 +1,5 @@
import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, ParquetType } from './constants.js'
import { schemaTree } from './schema.js'
import { schemaElement } from './schema.js'
import { deserializeTCompactProtocol } from './thrift.js'
/**
@ -170,7 +170,7 @@ export function parquetMetadata(arrayBuffer) {
* @returns {SchemaTree} tree of schema elements
*/
export function parquetSchema(metadata) {
return schemaTree(metadata.schema, 0)
return schemaElement(metadata.schema, [])
}
/**

@ -10,7 +10,7 @@
* @param {number} rootIndex index of the root element
* @returns {SchemaTree} tree of schema elements
*/
export function schemaTree(schema, rootIndex) {
function schemaTree(schema, rootIndex) {
const root = schema[rootIndex]
const children = []
let count = 1
@ -32,7 +32,7 @@ export function schemaTree(schema, rootIndex) {
*
* @param {SchemaElement[]} schema
* @param {string[]} name path to the element
* @returns {SchemaElement} schema element
* @returns {SchemaTree} schema tree node (element, children, count) at the given path
*/
export function schemaElement(schema, name) {
let tree = schemaTree(schema, 0)
@ -42,7 +42,7 @@ export function schemaElement(schema, name) {
if (!child) throw new Error(`parquet schema element not found: ${name}`)
tree = child
}
return tree.element
return tree
}
/**
@ -77,7 +77,7 @@ export function isRequired(schema, name) {
export function getMaxRepetitionLevel(schema, parts) {
let maxLevel = 0
parts.forEach((part, i) => {
const element = schemaElement(schema, parts.slice(0, i + 1))
const { element } = schemaElement(schema, parts.slice(0, i + 1))
if (element.repetition_type === 'REPEATED') {
maxLevel += 1
}
@ -95,7 +95,7 @@ export function getMaxRepetitionLevel(schema, parts) {
export function getMaxDefinitionLevel(schema, parts) {
let maxLevel = 0
parts.forEach((part, i) => {
const element = schemaElement(schema, parts.slice(0, i + 1))
const { element } = schemaElement(schema, parts.slice(0, i + 1))
if (element.repetition_type !== 'REQUIRED') {
maxLevel += 1
}

@ -20,7 +20,11 @@ describe('Parquet schema utils', () => {
describe('schemaElement', () => {
it('should return the correct schema element', () => {
expect(schemaElement(schema, ['child1'])).toEqual(schema[1])
expect(schemaElement(schema, ['child1'])).toEqual({
children: [],
count: 1,
element: { name: 'child1', repetition_type: 'OPTIONAL' },
})
})
it('should throw an error if element not found', () => {