mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-29 08:26:39 +00:00
Rewrite dremel assembly
This commit is contained in:
parent
71dd68540d
commit
54ef1e6b0a
@ -202,6 +202,7 @@ Parsing a [420mb wikipedia parquet file](https://huggingface.co/datasets/wikimed
|
||||
- https://github.com/apache/thrift
|
||||
- https://github.com/apache/arrow
|
||||
- https://github.com/dask/fastparquet
|
||||
- https://github.com/duckdb/duckdb
|
||||
- https://github.com/google/snappy
|
||||
- https://github.com/ironSource/parquetjs
|
||||
- https://github.com/zhipeng-jia/snappyjs
|
||||
|
||||
@ -2,11 +2,10 @@
|
||||
* Dremel-assembly of arrays of values into lists
|
||||
*
|
||||
* Reconstructs a complex nested structure from flat arrays of definition and repetition levels,
|
||||
* according to Dremel encoding. This simplified version focuses on arrays and scalar values,
|
||||
* with optional support for null values.
|
||||
* according to Dremel encoding.
|
||||
*
|
||||
* @param {number[] | undefined} definitionLevels definition levels, max 3
|
||||
* @param {number[]} repetitionLevels repetition levels, max 1
|
||||
* @param {number[] | undefined} definitionLevels definition levels
|
||||
* @param {number[]} repetitionLevels repetition levels
|
||||
* @param {ArrayLike<any>} values values to process
|
||||
* @param {boolean} isNullable can entries be null?
|
||||
* @param {number} maxDefinitionLevel definition level that corresponds to non-null
|
||||
@ -36,36 +35,27 @@ export function assembleObjects(
|
||||
// Construct new lists up to max repetition level
|
||||
// @ts-expect-error won't be empty
|
||||
currentContainer = containerStack.at(-1)
|
||||
if (def) {
|
||||
for (let j = rep; j < maxRepetitionLevel; j++) {
|
||||
/** @type {any[]} */
|
||||
const newList = []
|
||||
currentContainer.push(newList)
|
||||
currentContainer = newList
|
||||
containerStack.push(newList)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add lists up to definition level
|
||||
const targetDepth = isNullable ? (def + 1) / 2 : maxRepetitionLevel + 1
|
||||
for (let j = containerStack.length; j < targetDepth; j++) {
|
||||
/** @type {any[]} */
|
||||
const newList = []
|
||||
currentContainer.push(newList)
|
||||
currentContainer = newList
|
||||
containerStack.push(newList)
|
||||
}
|
||||
|
||||
// Add value or null based on definition level
|
||||
if (def === maxDefinitionLevel) {
|
||||
if (!currentContainer) {
|
||||
throw new Error('parquet assembleObjects: currentContainer is undefined')
|
||||
}
|
||||
currentContainer.push(values[valueIndex++])
|
||||
} else if (isNullable) {
|
||||
if (def) {
|
||||
// TODO: Go up maxDefinitionLevel - def - 1 levels to add null
|
||||
for (let j = def; j < maxDefinitionLevel - 1; j++) {
|
||||
containerStack.pop()
|
||||
// @ts-expect-error won't be empty
|
||||
currentContainer = containerStack.at(-1)
|
||||
}
|
||||
if (def > 1) {
|
||||
currentContainer.push(undefined)
|
||||
}
|
||||
} else {
|
||||
// TODO: actually depends on level required or not
|
||||
if (def % 2 === 0) {
|
||||
currentContainer.push(undefined)
|
||||
} else {
|
||||
currentContainer.push([])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -182,10 +182,14 @@ async function readRowGroup(options, rowGroup, groupStart) {
|
||||
const obj = {}
|
||||
for (let j = 0; j < keys[i].length; j++) {
|
||||
if (Array.isArray(keys[i][j])) {
|
||||
// TODO: key should not be an array, this is an assemble bug
|
||||
// TODO: key should not be an array, this is an assemble bug?
|
||||
keys[i][j] = keys[i][j][0]
|
||||
values[i][j] = values[i][j][0]
|
||||
}
|
||||
if (keys[i][j] instanceof Uint8Array) {
|
||||
// decode utf-8 keys
|
||||
keys[i][j] = new TextDecoder().decode(keys[i][j])
|
||||
}
|
||||
if (!keys[i][j]) continue
|
||||
obj[keys[i][j]] = values[i][j] === undefined ? null : values[i][j]
|
||||
}
|
||||
|
||||
@ -83,14 +83,14 @@ describe('assembleObjects', () => {
|
||||
expect(result).toEqual([[[]]])
|
||||
})
|
||||
|
||||
it('should handle isNull', () => {
|
||||
it('should handle nonnullable lists', () => {
|
||||
// from nonnullable.impala.parquet
|
||||
const result = assembleObjects([2], [0], [-1], false, 2, 2)
|
||||
expect(result).toEqual([[[-1]]])
|
||||
})
|
||||
|
||||
it('should handle nullable int_array', () => {
|
||||
// from nullable.impala.parquet
|
||||
// from nullable.impala.parquet int_array
|
||||
// [1 2 3][N 1 2 N 3 N][ ] N N
|
||||
const definitionLevels = [3, 3, 3, 2, 3, 3, 2, 3, 2, 1, 0, 0]
|
||||
const repetitionLevels = [0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0]
|
||||
@ -104,4 +104,22 @@ describe('assembleObjects', () => {
|
||||
undefined,
|
||||
])
|
||||
})
|
||||
|
||||
it('should handle nullable int_array_Array', () => {
|
||||
// from nullable.impala.parquet int_array_Array
|
||||
// [1 2][3 4][[N 1 2 N][3 N 4] [] N][N] [] N N [N 5 6]
|
||||
const definitionLevels = [5, 5, 5, 5, 4, 5, 5, 4, 5, 4, 5, 3, 2, 2, 1, 0, 0, 2, 5, 5]
|
||||
const repetitionLevels = [0, 2, 1, 2, 0, 2, 2, 2, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 2]
|
||||
const values = [1, 2, 3, 4, 1, 2, 3, 4, 5, 6]
|
||||
const result = assembleObjects(definitionLevels, repetitionLevels, values, true, 5, 2)
|
||||
expect(result).toEqual([
|
||||
[[1, 2], [3, 4]],
|
||||
[[undefined, 1, 2, undefined], [3, undefined, 4], [], undefined],
|
||||
[undefined],
|
||||
[],
|
||||
undefined,
|
||||
undefined,
|
||||
[undefined, [5, 6]],
|
||||
])
|
||||
})
|
||||
})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user