Rewrite dremel assembly

This commit is contained in:
Kenny Daniel 2024-04-28 19:03:39 -07:00
parent 71dd68540d
commit 54ef1e6b0a
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 43 additions and 30 deletions

@ -202,6 +202,7 @@ Parsing a [420mb wikipedia parquet file](https://huggingface.co/datasets/wikimed
- https://github.com/apache/thrift
- https://github.com/apache/arrow
- https://github.com/dask/fastparquet
- https://github.com/duckdb/duckdb
- https://github.com/google/snappy
- https://github.com/ironSource/parquetjs
- https://github.com/zhipeng-jia/snappyjs

@ -2,11 +2,10 @@
* Dremel-assembly of arrays of values into lists
*
* Reconstructs a complex nested structure from flat arrays of definition and repetition levels,
* according to Dremel encoding. This simplified version focuses on arrays and scalar values,
* with optional support for null values.
* according to Dremel encoding.
*
* @param {number[] | undefined} definitionLevels definition levels, max 3
* @param {number[]} repetitionLevels repetition levels, max 1
* @param {number[] | undefined} definitionLevels definition levels
* @param {number[]} repetitionLevels repetition levels
* @param {ArrayLike<any>} values values to process
* @param {boolean} isNullable can entries be null?
* @param {number} maxDefinitionLevel definition level that corresponds to non-null
@ -36,36 +35,27 @@ export function assembleObjects(
// Construct new lists up to max repetition level
// @ts-expect-error won't be empty
currentContainer = containerStack.at(-1)
if (def) {
for (let j = rep; j < maxRepetitionLevel; j++) {
/** @type {any[]} */
const newList = []
currentContainer.push(newList)
currentContainer = newList
containerStack.push(newList)
}
}
}
// Add lists up to definition level
const targetDepth = isNullable ? (def + 1) / 2 : maxRepetitionLevel + 1
for (let j = containerStack.length; j < targetDepth; j++) {
/** @type {any[]} */
const newList = []
currentContainer.push(newList)
currentContainer = newList
containerStack.push(newList)
}
// Add value or null based on definition level
if (def === maxDefinitionLevel) {
if (!currentContainer) {
throw new Error('parquet assembleObjects: currentContainer is undefined')
}
currentContainer.push(values[valueIndex++])
} else if (isNullable) {
if (def) {
// TODO: Go up maxDefinitionLevel - def - 1 levels to add null
for (let j = def; j < maxDefinitionLevel - 1; j++) {
containerStack.pop()
// @ts-expect-error won't be empty
currentContainer = containerStack.at(-1)
}
if (def > 1) {
currentContainer.push(undefined)
}
} else {
// TODO: actually depends on level required or not
if (def % 2 === 0) {
currentContainer.push(undefined)
} else {
currentContainer.push([])
}
}
}

@ -182,10 +182,14 @@ async function readRowGroup(options, rowGroup, groupStart) {
const obj = {}
for (let j = 0; j < keys[i].length; j++) {
if (Array.isArray(keys[i][j])) {
// TODO: key should not be an array, this is an assemble bug
// TODO: key should not be an array, this is an assemble bug?
keys[i][j] = keys[i][j][0]
values[i][j] = values[i][j][0]
}
if (keys[i][j] instanceof Uint8Array) {
// decode utf-8 keys
keys[i][j] = new TextDecoder().decode(keys[i][j])
}
if (!keys[i][j]) continue
obj[keys[i][j]] = values[i][j] === undefined ? null : values[i][j]
}

@ -83,14 +83,14 @@ describe('assembleObjects', () => {
expect(result).toEqual([[[]]])
})
it('should handle isNull', () => {
it('should handle nonnullable lists', () => {
// from nonnullable.impala.parquet
const result = assembleObjects([2], [0], [-1], false, 2, 2)
expect(result).toEqual([[[-1]]])
})
it('should handle nullable int_array', () => {
// from nullable.impala.parquet
// from nullable.impala.parquet int_array
// [1 2 3][N 1 2 N 3 N][ ] N N
const definitionLevels = [3, 3, 3, 2, 3, 3, 2, 3, 2, 1, 0, 0]
const repetitionLevels = [0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0]
@ -104,4 +104,22 @@ describe('assembleObjects', () => {
undefined,
])
})
it('should handle nullable int_array_Array', () => {
// Doubly nested nullable list case: checks that flat definition/repetition
// levels are assembled into rows of lists-of-lists, including null entries,
// empty lists, null inner lists and null rows.
// from nullable.impala.parquet int_array_Array
// Flattened sketch of the expected rows (N = null, asserted as undefined below);
// the outer brackets of each row are omitted in this sketch:
// [1 2][3 4][[N 1 2 N][3 N 4] [] N][N] [] N N [N 5 6]
// The seven 0 entries in repetitionLevels line up with the seven expected rows.
// The ten entries at definition level 5 (the max passed below) line up with the ten values.
const definitionLevels = [5, 5, 5, 5, 4, 5, 5, 4, 5, 4, 5, 3, 2, 2, 1, 0, 0, 2, 5, 5]
const repetitionLevels = [0, 2, 1, 2, 0, 2, 2, 2, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 2]
const values = [1, 2, 3, 4, 1, 2, 3, 4, 5, 6]
// Trailing args: isNullable = true, maxDefinitionLevel = 5; the final 2 is
// presumably the max repetition level (two levels of list nesting) -- confirm
// against the assembleObjects signature.
const result = assembleObjects(definitionLevels, repetitionLevels, values, true, 5, 2)
expect(result).toEqual([
[[1, 2], [3, 4]],
[[undefined, 1, 2, undefined], [3, undefined, 4], [], undefined],
[undefined],
[],
undefined,
undefined,
[undefined, [5, 6]],
])
})
})