hyparquet/src/assemble.js

237 lines
7.4 KiB
JavaScript
Raw Normal View History

2024-05-18 05:44:03 +00:00
import { isListLike, isMapLike } from './schema.js'
2024-03-18 23:36:16 +00:00
/**
 * Dremel-assembly of arrays of values into lists
 *
 * Reconstructs a complex nested structure from flat arrays of definition and repetition levels,
 * according to Dremel encoding.
 *
 * One entry is consumed from the level arrays per iteration. Before each entry the
 * container stack is unwound according to the repetition level, then new nested arrays
 * are opened until the entry's definition level (or the deepest container depth) is reached.
 * Every non-REQUIRED depth that is opened contributes one array layer to the output.
 *
 * @typedef {import('./types.d.ts').DecodedArray} DecodedArray
 * @typedef {import('./types.d.ts').FieldRepetitionType} FieldRepetitionType
 * @param {number[] | undefined} definitionLevels when missing or empty, every entry is treated as maxDefinitionLevel
 * @param {number[]} repetitionLevels
 * @param {DecodedArray} values non-null values, consumed in order
 * @param {(FieldRepetitionType | undefined)[]} repetitionPath repetition type per schema depth;
 *   only indices 1 through length - 2 are consulted here
 * @param {number} maxDefinitionLevel definition level that corresponds to non-null
 * @param {number} maxRepetitionLevel repetition level that corresponds to a new row
 * @returns {DecodedArray} array of values
 */
export function assembleLists(
  definitionLevels, repetitionLevels, values, repetitionPath, maxDefinitionLevel, maxRepetitionLevel
) {
  // Fall back to the repetition levels when definition levels are missing or empty
  const n = definitionLevels?.length || repetitionLevels.length
  let valueIndex = 0
  /** @type {any[]} */
  const output = []

  // Track state of nested structures
  const containerStack = [output]
  let currentContainer = output
  let currentDepth = 0 // schema depth
  let currentDefLevel = 0 // list depth
  let currentRepLevel = 0

  for (let i = 0; i < n; i++) {
    // assert(currentDefLevel === containerStack.length - 1)
    const def = definitionLevels?.length ? definitionLevels[i] : maxDefinitionLevel
    const rep = repetitionLevels[i]

    // Pop up to start of rep level
    while (currentDepth && (rep < currentRepLevel || repetitionPath[currentDepth] === 'OPTIONAL')) {
      // only non-REQUIRED depths pushed a container on the way down
      if (repetitionPath[currentDepth] !== 'REQUIRED') {
        containerStack.pop()
        currentDefLevel--
      }
      if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel--
      currentDepth--
    }
    // @ts-expect-error won't be empty
    currentContainer = containerStack.at(-1)

    // Go deeper to end of definition level
    while (currentDepth < repetitionPath.length - 2 && currentDefLevel < def) {
      currentDepth++
      if (repetitionPath[currentDepth] !== 'REQUIRED') {
        /** @type {any[]} */
        const newList = []
        currentContainer.push(newList)
        currentContainer = newList
        containerStack.push(newList)
        currentDefLevel++
      }
      if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel++
    }

    // Add value or null based on definition level
    if (def === maxDefinitionLevel) {
      // assert(currentDepth === maxDefinitionLevel || currentDepth === repetitionPath.length - 2)
      currentContainer.push(values[valueIndex++])
    } else if (currentDepth === repetitionPath.length - 2) {
      // reached the deepest container but the value itself is undefined
      currentContainer.push(null)
    } else {
      // stopped above the deepest container: emit an empty list placeholder
      currentContainer.push([])
    }
  }

  // Handle edge cases for empty inputs or single-level data
  if (output.length === 0) {
    if (values.length > 0 && maxRepetitionLevel === 0) {
      return values // flat list
    }
    // return max definition level of nested lists
    for (let i = 0; i < maxDefinitionLevel; i++) {
      /** @type {any[]} */
      const newList = []
      currentContainer.push(newList)
      currentContainer = newList
    }
  }

  return output
}
2024-05-18 05:44:03 +00:00
/**
 * Assemble a nested structure from subcolumn data.
 * https://github.com/apache/parquet-format/blob/apache-parquet-format-2.10.0/LogicalTypes.md#nested-types
 *
 * Recurses through the schema tree. For each list-like, map-like or struct-like node,
 * the entries of its children in `subcolumnData` are removed and replaced by a single
 * entry keyed by the node's own dotted path. The map is modified in place and nothing
 * is returned. A node that matches none of these cases (no children) is left untouched.
 *
 * @typedef {import('./types.d.ts').SchemaTree} SchemaTree
 * @param {Map<string, any[]>} subcolumnData
 * @param {SchemaTree} schema top-level schema element
 * @param {number} [depth] depth of nested structure
 * @throws {Error} if expected child data is missing, or map keys and values differ in length
 */
export function assembleNested(subcolumnData, schema, depth = 0) {
  const path = schema.path.join('.')
  const optional = schema.element.repetition_type === 'OPTIONAL'
  // an OPTIONAL node adds one more level of nesting for its descendants
  const nextDepth = optional ? depth + 1 : depth

  if (isListLike(schema)) {
    // the element is the grandchild; the intermediate child is skipped
    const sublist = schema.children[0].children[0]
    assembleNested(subcolumnData, sublist, nextDepth + 1)

    const subcolumn = sublist.path.join('.')
    const values = subcolumnData.get(subcolumn)
    if (!values) throw new Error('parquet list-like column missing values')
    if (optional) flattenAtDepth(values, depth)
    // re-key the assembled element data under the list's own path
    subcolumnData.set(path, values)
    subcolumnData.delete(subcolumn)
    return
  }

  if (isMapLike(schema)) {
    const mapName = schema.children[0].element.name

    // Assemble keys and values
    assembleNested(subcolumnData, schema.children[0].children[0], nextDepth + 1)
    assembleNested(subcolumnData, schema.children[0].children[1], nextDepth + 1)

    const keys = subcolumnData.get(`${path}.${mapName}.key`)
    const values = subcolumnData.get(`${path}.${mapName}.value`)
    if (!keys) throw new Error('parquet map-like column missing keys')
    if (!values) throw new Error('parquet map-like column missing values')
    if (keys.length !== values.length) {
      throw new Error('parquet map-like column key/value length mismatch')
    }

    const out = assembleMaps(keys, values, nextDepth)
    if (optional) flattenAtDepth(out, depth)

    subcolumnData.delete(`${path}.${mapName}.key`)
    subcolumnData.delete(`${path}.${mapName}.value`)
    subcolumnData.set(path, out)
    return
  }

  // Struct-like column
  if (schema.children.length) {
    // construct a meta struct and then invert
    /** @type {Record<string, any>} */
    const struct = {}
    for (const child of schema.children) {
      assembleNested(subcolumnData, child, nextDepth)
      const childData = subcolumnData.get(child.path.join('.'))
      if (!childData) throw new Error('parquet struct-like column missing child data')
      struct[child.element.name] = childData
    }
    // remove children
    for (const child of schema.children) {
      subcolumnData.delete(child.path.join('.'))
    }
    // invert struct by depth
    // any non-REQUIRED struct is inverted one level deeper
    const invertDepth = schema.element.repetition_type === 'REQUIRED' ? depth : depth + 1
    const inverted = invertStruct(struct, invertDepth)
    if (optional) flattenAtDepth(inverted, depth)
    subcolumnData.set(path, inverted)
    return
  }
  // assert(schema.element.repetition_type !== 'REPEATED')
}
/**
 * Unwrap, in place, every element located `depth` array levels below `arr`:
 * each such element is replaced by its own first entry.
 *
 * @param {any[]} arr array to modify in place
 * @param {number} depth number of array levels to descend before unwrapping
 */
function flattenAtDepth(arr, depth) {
  if (depth) {
    // not at the target level yet: descend into every element
    for (let index = 0; index < arr.length; index++) {
      flattenAtDepth(arr[index], depth - 1)
    }
    return
  }
  // target level: swap each wrapper for its first entry
  for (let index = 0; index < arr.length; index++) {
    const wrapper = arr[index]
    arr[index] = wrapper[0]
  }
}
/**
 * Zip parallel key and value arrays into plain objects, `depth` array levels down.
 *
 * At depth 0 each truthy `keys[i]` becomes one object whose properties pair
 * `keys[i][j]` with `values[i][j]`; an undefined value is stored as null.
 * A falsy `keys[i]` yields undefined in the output.
 *
 * @param {any[]} keys
 * @param {any[]} values
 * @param {number} depth number of array levels to descend before zipping
 * @returns {any[]}
 */
function assembleMaps(keys, values, depth) {
  const result = []

  if (depth) {
    // still above the map level: recurse pairwise
    for (let row = 0; row < keys.length; row++) {
      result.push(assembleMaps(keys[row], values[row], depth - 1))
    }
    return result
  }

  for (let row = 0; row < keys.length; row++) {
    if (!keys[row]) {
      result.push(undefined)
      continue
    }
    /** @type {Record<string, any>} */
    const entry = {}
    for (let col = 0; col < keys[row].length; col++) {
      const item = values[row][col]
      entry[keys[row][col]] = item === undefined ? null : item
    }
    result.push(entry)
  }
  return result
}
/**
 * Turn an object of parallel arrays into an array of objects, recursively.
 *
 * The number of rows is taken from the first key's array. Row `i` collects
 * `struct[key][i]` for every key; while `depth` is non-zero that row object
 * is itself inverted with `depth - 1`.
 *
 * @param {Record<string, any[]>} struct
 * @param {number} depth number of further array levels to invert
 * @returns {any[]}
 */
function invertStruct(struct, depth) {
  const fieldNames = Object.keys(struct)
  // undefined when there are no keys or the first field is nullish, so the loop never runs
  const rowCount = struct[fieldNames[0]]?.length
  const rows = []
  for (let row = 0; row < rowCount; row++) {
    /** @type {Record<string, any>} */
    const record = {}
    for (const name of fieldNames) {
      record[name] = struct[name][row]
    }
    rows.push(depth ? invertStruct(record, depth - 1) : record) // deeper while depth remains
  }
  return rows
}