// hyparquet/src/assemble.js (249 lines, 7.9 KiB, JavaScript)

import { getMaxDefinitionLevel, isListLike, isMapLike } from './schema.js'
/**
 * Reconstructs a complex nested structure from flat arrays of values and
 * definition and repetition levels, according to Dremel encoding.
 *
 * Assembled rows are appended to `output`. When the first repetition level is
 * non-zero, the first value continues the last row already present in `output`
 * (a row that was split across pages) instead of starting a new one.
 *
 * @import {DecodedArray} from '../src/types.d.ts'
 * @param {any[]} output array that assembled rows are appended to (mutated)
 * @param {number[] | undefined} definitionLevels when missing or empty, every
 *   entry is treated as being at the maximum definition level
 * @param {number[]} repetitionLevels
 * @param {DecodedArray} values flat values, consumed in order for each entry at
 *   the maximum definition level
 * @param {SchemaTree[]} schemaPath schema elements leading to the column
 * @returns {DecodedArray} `values` untouched when there are no levels at all,
 *   otherwise `output`
 * @throws {Error} if a continued row has no container to resume into
 */
export function assembleLists(output, definitionLevels, repetitionLevels, values, schemaPath) {
  const n = definitionLevels?.length || repetitionLevels.length
  if (!n) return values
  const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
  const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
  // Schema depth of the innermost container; the last path entry holds the values
  const maxContainerDepth = repetitionPath.length - 2
  let valueIndex = 0

  // Track state of nested structures
  const containerStack = [output]
  let currentContainer = output
  let currentDepth = 0 // schema depth
  let currentDefLevel = 0 // list depth
  let currentRepLevel = 0

  if (repetitionLevels[0]) {
    // continue previous row
    while (currentDepth < maxContainerDepth && currentRepLevel < repetitionLevels[0]) {
      currentDepth++
      // REQUIRED levels never get a container of their own (see the build loop
      // below), so only descend for levels that actually created a list.
      // This keeps currentDefLevel === containerStack.length - 1.
      if (repetitionPath[currentDepth] !== 'REQUIRED') {
        if (!currentContainer) throw new Error('parquet cannot resume previous page')
        // go into last list
        currentContainer = currentContainer.at(-1)
        containerStack.push(currentContainer)
        currentDefLevel++
      }
      if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel++
    }
  }

  for (let i = 0; i < n; i++) {
    // assert(currentDefLevel === containerStack.length - 1)
    const def = definitionLevels?.length ? definitionLevels[i] : maxDefinitionLevel
    const rep = repetitionLevels[i]

    // Pop up to start of rep level
    while (currentDepth && (rep < currentRepLevel || repetitionPath[currentDepth] !== 'REPEATED')) {
      if (repetitionPath[currentDepth] !== 'REQUIRED') {
        containerStack.pop()
        currentDefLevel--
      }
      if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel--
      currentDepth--
    }
    // @ts-expect-error won't be empty
    currentContainer = containerStack.at(-1)

    // Go deeper to end of definition level
    while (
      (currentDepth < maxContainerDepth || repetitionPath[currentDepth + 1] === 'REPEATED') &&
      (currentDefLevel < def || repetitionPath[currentDepth + 1] === 'REQUIRED')
    ) {
      currentDepth++
      if (repetitionPath[currentDepth] !== 'REQUIRED') {
        /** @type {any[]} */
        const newList = []
        currentContainer.push(newList)
        currentContainer = newList
        containerStack.push(newList)
        currentDefLevel++
      }
      if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel++
    }

    // Add value or null based on definition level
    if (def === maxDefinitionLevel) {
      // assert(currentDepth === maxDefinitionLevel || currentDepth === repetitionPath.length - 2)
      currentContainer.push(values[valueIndex++])
    } else if (currentDepth === maxContainerDepth) {
      // innermost container reached but the value itself is not defined
      currentContainer.push(null)
    } else {
      // stopped above the innermost container: placeholder for the missing level
      currentContainer.push([])
    }
  }

  // Handle edge cases for empty inputs or single-level data
  if (!output.length) {
    // return max definition level of nested lists
    for (let i = 0; i < maxDefinitionLevel; i++) {
      /** @type {any[]} */
      const newList = []
      currentContainer.push(newList)
      currentContainer = newList
    }
  }

  return output
}
/**
 * Assemble a nested structure from subcolumn data.
 * https://github.com/apache/parquet-format/blob/apache-parquet-format-2.10.0/LogicalTypes.md#nested-types
 *
 * Works in place on `subcolumnData`: children are assembled recursively first,
 * then their entries are removed from the map and replaced by a single entry
 * stored under this schema element's dotted path. Elements without children
 * that are neither list-like nor map-like are left untouched.
 *
 * @import {SchemaTree} from '../src/types.d.ts'
 * @param {Map<string, DecodedArray>} subcolumnData decoded data keyed by dotted column path (mutated)
 * @param {SchemaTree} schema top-level schema element
 * @param {number} [depth] depth of nested structure
 * @throws {Error} if expected child data is missing, or map keys and values differ in length
 */
export function assembleNested(subcolumnData, schema, depth = 0) {
  const path = schema.path.join('.')
  const optional = schema.element.repetition_type === 'OPTIONAL'
  // an OPTIONAL element contributes one extra level of nesting to its children
  const nextDepth = optional ? depth + 1 : depth

  if (isListLike(schema)) {
    let sublist = schema.children[0]
    let subDepth = nextDepth
    // when the repeated group wraps a single child, assemble that child directly
    if (sublist.children.length === 1) {
      sublist = sublist.children[0]
      subDepth++
    }
    assembleNested(subcolumnData, sublist, subDepth)

    const subcolumn = sublist.path.join('.')
    const values = subcolumnData.get(subcolumn)
    if (!values) throw new Error('parquet list column missing values')
    if (optional) flattenAtDepth(values, depth)
    // move the assembled data from the child path up to this element's path
    subcolumnData.set(path, values)
    subcolumnData.delete(subcolumn)
    return
  }

  if (isMapLike(schema)) {
    const mapName = schema.children[0].element.name

    // Assemble keys and values
    assembleNested(subcolumnData, schema.children[0].children[0], nextDepth + 1)
    assembleNested(subcolumnData, schema.children[0].children[1], nextDepth + 1)

    const keys = subcolumnData.get(`${path}.${mapName}.key`)
    const values = subcolumnData.get(`${path}.${mapName}.value`)
    if (!keys) throw new Error('parquet map column missing keys')
    if (!values) throw new Error('parquet map column missing values')
    if (keys.length !== values.length) {
      throw new Error('parquet map column key/value length mismatch')
    }

    const out = assembleMaps(keys, values, nextDepth)
    if (optional) flattenAtDepth(out, depth)

    // replace the separate key/value columns with the assembled map column
    subcolumnData.delete(`${path}.${mapName}.key`)
    subcolumnData.delete(`${path}.${mapName}.value`)
    subcolumnData.set(path, out)
    return
  }

  // Struct-like column
  if (schema.children.length) {
    // construct a meta struct and then invert
    const invertDepth = schema.element.repetition_type === 'REQUIRED' ? depth : depth + 1

    /** @type {Record<string, any>} */
    const struct = {}
    for (const child of schema.children) {
      assembleNested(subcolumnData, child, invertDepth)
      const childData = subcolumnData.get(child.path.join('.'))
      if (!childData) throw new Error('parquet struct missing child data')
      struct[child.element.name] = childData
    }

    // remove children
    for (const child of schema.children) {
      subcolumnData.delete(child.path.join('.'))
    }

    // invert struct by depth
    const inverted = invertStruct(struct, invertDepth)
    if (optional) flattenAtDepth(inverted, depth)
    subcolumnData.set(path, inverted)
  }
}
/**
 * Replace, in place, every element found `depth` levels down with its own
 * first element.
 *
 * @param {DecodedArray} arr nested array to modify
 * @param {number} depth number of levels to descend before unwrapping
 */
function flattenAtDepth(arr, depth) {
  if (depth) {
    // not at the target level yet: recurse into each child
    for (let idx = 0; idx < arr.length; idx++) {
      flattenAtDepth(arr[idx], depth - 1)
    }
    return
  }
  // target level: unwrap each entry to its first element
  for (let idx = 0; idx < arr.length; idx++) {
    const wrapped = arr[idx]
    arr[idx] = wrapped[0]
  }
}
/**
 * Zip parallel nested arrays of keys and values into plain objects.
 *
 * Descends `depth` levels through both arrays in lockstep. At the final level
 * each list of keys becomes one object; a missing (falsy) key list yields
 * `undefined`, and a value of `undefined` is stored as `null`.
 *
 * @param {DecodedArray} keys
 * @param {DecodedArray} values
 * @param {number} depth number of array levels above the key lists
 * @returns {any[]}
 */
function assembleMaps(keys, values, depth) {
  const out = []
  for (let i = 0; i < keys.length; i++) {
    if (depth) {
      out.push(assembleMaps(keys[i], values[i], depth - 1)) // go deeper
    } else if (keys[i]) {
      /** @type {Record<string, any>} */
      const obj = {}
      for (let j = 0; j < keys[i].length; j++) {
        const value = values[i][j]
        const key = keys[i][j]
        const stored = value === undefined ? null : value
        if (key === '__proto__') {
          // Plain assignment would hit the Object.prototype.__proto__ setter and
          // either drop the entry or replace the object's prototype. Map keys
          // come from file data, so store it as an ordinary own property.
          Object.defineProperty(obj, key, {
            value: stored,
            writable: true,
            enumerable: true,
            configurable: true,
          })
        } else {
          obj[key] = stored
        }
      }
      out.push(obj)
    } else {
      out.push(undefined)
    }
  }
  return out
}
/**
 * Invert a struct-like object by depth.
 *
 * Turns an object of parallel arrays (one per field) into an array of objects
 * (one per row). When `depth` is non-zero each row is itself inverted again
 * with `depth - 1`, so the objects end up `depth` array levels down.
 *
 * @param {Record<string, any[]>} struct
 * @param {number} depth
 * @returns {any[]}
 * @throws {Error} if the fields do not all have the same length as the first field
 */
function invertStruct(struct, depth) {
  const keys = Object.keys(struct)
  const length = struct[keys[0]]?.length

  // Validate once up front rather than for every row. Skipped when there are
  // no rows to build, in which case an empty array is returned.
  if (length > 0) {
    for (const key of keys) {
      if (struct[key].length !== length) throw new Error('parquet struct parsing error')
    }
  }

  const out = []
  for (let i = 0; i < length; i++) {
    /** @type {Record<string, any>} */
    const obj = {}
    for (const key of keys) {
      obj[key] = struct[key][i]
    }
    if (depth) {
      out.push(invertStruct(obj, depth - 1)) // deeper
    } else {
      out.push(obj)
    }
  }
  return out
}