// hyparquet/src/schema.js

/**
 * @typedef {import('./types.js').SchemaElement} SchemaElement
 * @typedef {import('./types.js').SchemaTree} SchemaTree
 */
/**
 * Build a tree from the flat list of schema elements.
 *
 * The element at `rootIndex` becomes the root. Its `num_children` direct
 * children are read from the entries that immediately follow it; each child
 * is built recursively and occupies `child.count` consecutive entries.
 *
 * @param {SchemaElement[]} schema
 * @param {number} rootIndex index of the root element
 * @returns {SchemaTree} tree of schema elements, where `count` is the number
 *   of schema entries consumed by this subtree (the root included)
 * @throws {Error} if the schema has no element at an index the tree needs
 */
function schemaTree(schema, rootIndex) {
  const root = schema[rootIndex]
  if (!root) {
    // Happens when num_children promises more elements than the list holds
    throw new Error(`parquet schema element missing at index ${rootIndex}`)
  }

  const children = []
  let count = 1

  // Elements without children have num_children unset (or 0)
  const numChildren = root.num_children ?? 0
  while (children.length < numChildren) {
    const child = schemaTree(schema, rootIndex + count)
    count += child.count
    children.push(child)
  }

  return { count, element: root, children }
}
/**
 * Get the schema tree node addressed by the given path of names.
 *
 * Starts at the root of the schema tree and, for each path part in order,
 * moves to the direct child whose element name equals that part. An empty
 * path returns the root node.
 *
 * @param {SchemaElement[]} schema
 * @param {string[]} name path to the element
 * @returns {SchemaTree} schema tree node at the end of the path
 * @throws {Error} if some path part has no matching child
 */
function schemaElement(schema, name) {
  let node = schemaTree(schema, 0)
  // Descend one level per path part
  for (const part of name) {
    const next = node.children.find(({ element }) => element.name === part)
    if (!next) throw new Error(`parquet schema element not found: ${name}`)
    node = next
  }
  return node
}
/**
 * Get each schema tree node from the root to the given element name.
 *
 * The returned array always starts with the root node, followed by one node
 * per path part, so its length is `name.length + 1`.
 *
 * @param {SchemaElement[]} schema
 * @param {string[]} name path to the element
 * @returns {SchemaTree[]} nodes along the path, root first
 * @throws {Error} if some path part has no matching child
 */
export function getSchemaPath(schema, name) {
  const root = schemaTree(schema, 0)
  const path = [root]
  for (const part of name) {
    // Always search below the most recently added node
    const parent = path[path.length - 1]
    const next = parent.children.find(({ element }) => element.name === part)
    if (!next) throw new Error(`parquet schema element not found: ${name}`)
    path.push(next)
  }
  return path
}
/**
 * Check if the element at the end of the schema path is required.
 *
 * The element counts as required only when every node below the root on the
 * path (its ancestors and the element itself) has repetition_type
 * 'REQUIRED'. The root node at index 0 is not inspected.
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {boolean} true if the element is required
 */
export function isRequired(schemaPath) {
  return schemaPath
    .slice(1)
    .every(({ element }) => element.repetition_type === 'REQUIRED')
}
/**
 * Get the max repetition level for a given schema path.
 *
 * This is the number of nodes below the root (index 0 is skipped) whose
 * repetition_type is 'REPEATED'.
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {number} max repetition level
 */
export function getMaxRepetitionLevel(schemaPath) {
  return schemaPath
    .slice(1)
    .filter(({ element }) => element.repetition_type === 'REPEATED')
    .length
}
/**
 * Get the max definition level for a given schema path.
 *
 * This is the number of nodes below the root (index 0 is skipped) whose
 * repetition_type is anything other than 'REQUIRED'.
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {number} max definition level
 */
export function getMaxDefinitionLevel(schemaPath) {
  return schemaPath
    .slice(1)
    .filter(({ element }) => element.repetition_type !== 'REQUIRED')
    .length
}
/**
 * Get the number of bytes to skip for definition levels.
 *
 * The result starts at 6 and grows by one for every 7-bit group needed to
 * represent `num >>> 8`. Because of the unsigned shift, `num` is treated as
 * an unsigned 32-bit integer.
 *
 * @param {number} num number of values
 * @returns {number} number of bytes to skip
 */
export function skipDefinitionBytes(num) {
  let byteLength = 6
  // One extra byte per 7 bits remaining above the low 8 bits
  for (let rest = num >>> 8; rest !== 0; rest >>>= 7) {
    byteLength++
  }
  return byteLength
}
/**
 * Get the column name as foo.bar and handle list-like columns.
 *
 * For list-like and map-like columns the last two path parts are dropped
 * before joining, so the name refers to the list or map column itself.
 *
 * @param {SchemaElement[]} schema
 * @param {string[]} path
 * @returns {string} column name
 */
export function getColumnName(schema, path) {
  const isWrapped = isListLike(schema, path) || isMapLike(schema, path)
  const parts = isWrapped ? path.slice(0, -2) : path
  return parts.join('.')
}
/**
 * Check if a column is list-like.
 *
 * The node two levels above the end of `path` must be a LIST-converted
 * group with exactly one REPEATED child, which in turn has exactly one
 * REQUIRED child. Paths shorter than three parts are never list-like.
 *
 * @param {SchemaElement[]} schemaElements parquet schema elements
 * @param {string[]} path column path
 * @returns {boolean} true if list-like
 * @throws {Error} if the ancestor path cannot be found in the schema
 */
export function isListLike(schemaElements, path) {
  // Check the length first so short paths never trigger a schema lookup
  if (path.length < 3) return false

  const schema = schemaElement(schemaElements, path.slice(0, -2))
  if (schema.element.converted_type !== 'LIST') return false
  // Exactly one child: a childless group would make children[0] undefined
  if (schema.children.length !== 1) return false

  const [firstChild] = schema.children
  if (firstChild.children.length !== 1) return false
  if (firstChild.element.repetition_type !== 'REPEATED') return false

  const [secondChild] = firstChild.children
  if (secondChild.element.repetition_type !== 'REQUIRED') return false

  return true
}
/**
 * Check if a column is map-like.
 *
 * The node two levels above the end of `path` must be a MAP-converted group
 * with exactly one REPEATED child that has exactly two children: one named
 * 'key' that is REQUIRED, and no child named 'value' that is REPEATED.
 * Paths shorter than three parts are never map-like.
 *
 * @param {SchemaElement[]} schemaElements parquet schema elements
 * @param {string[]} path column path
 * @returns {boolean} true if map-like
 * @throws {Error} if the ancestor path cannot be found in the schema
 */
export function isMapLike(schemaElements, path) {
  // Check the length first so short paths never trigger a schema lookup
  if (path.length < 3) return false

  const schema = schemaElement(schemaElements, path.slice(0, -2))
  if (schema.element.converted_type !== 'MAP') return false
  // Exactly one child: a childless group would make children[0] undefined
  if (schema.children.length !== 1) return false

  const [firstChild] = schema.children
  if (firstChild.children.length !== 2) return false
  if (firstChild.element.repetition_type !== 'REPEATED') return false

  const keyChild = firstChild.children.find(child => child.element.name === 'key')
  if (keyChild?.element.repetition_type !== 'REQUIRED') return false

  const valueChild = firstChild.children.find(child => child.element.name === 'value')
  if (valueChild?.element.repetition_type === 'REPEATED') return false

  return true
}