diff --git a/README.md b/README.md index d796148..1973540 100644 --- a/README.md +++ b/README.md @@ -202,6 +202,7 @@ Parsing a [420mb wikipedia parquet file](https://huggingface.co/datasets/wikimed - https://github.com/apache/thrift - https://github.com/apache/arrow - https://github.com/dask/fastparquet + - https://github.com/duckdb/duckdb - https://github.com/google/snappy - https://github.com/ironSource/parquetjs - https://github.com/zhipeng-jia/snappyjs diff --git a/src/assemble.js b/src/assemble.js index 0e15bf0..fbe5a98 100644 --- a/src/assemble.js +++ b/src/assemble.js @@ -2,11 +2,10 @@ * Dremel-assembly of arrays of values into lists * * Reconstructs a complex nested structure from flat arrays of definition and repetition levels, - * according to Dremel encoding. This simplified version focuses on arrays and scalar values, - * with optional support for null values. + * according to Dremel encoding. * - * @param {number[] | undefined} definitionLevels definition levels, max 3 - * @param {number[]} repetitionLevels repetition levels, max 1 + * @param {number[] | undefined} definitionLevels definition levels + * @param {number[]} repetitionLevels repetition levels * @param {ArrayLike} values values to process * @param {boolean} isNullable can entries be null? * @param {number} maxDefinitionLevel definition level that corresponds to non-null @@ -36,36 +35,27 @@ export function assembleObjects( // Construct new lists up to max repetition level // @ts-expect-error won't be empty currentContainer = containerStack.at(-1) - if (def) { - for (let j = rep; j < maxRepetitionLevel; j++) { - /** @type {any[]} */ - const newList = [] - currentContainer.push(newList) - currentContainer = newList - containerStack.push(newList) - } - } + } + + // Add lists up to definition level + const targetDepth = isNullable ? (def + 1) / 2 : maxRepetitionLevel + 1 + for (let j = containerStack.length; j < targetDepth; j++) { + /** @type {any[]} */ + const newList = [] + currentContainer.push(newList) + currentContainer = newList + containerStack.push(newList) } // Add value or null based on definition level if (def === maxDefinitionLevel) { - if (!currentContainer) { - throw new Error('parquet assembleObjects: currentContainer is undefined') - } currentContainer.push(values[valueIndex++]) } else if (isNullable) { - if (def) { - // TODO: Go up maxDefinitionLevel - def - 1 levels to add null - for (let j = def; j < maxDefinitionLevel - 1; j++) { - containerStack.pop() - // @ts-expect-error won't be empty - currentContainer = containerStack.at(-1) - } - if (def > 1) { - currentContainer.push(undefined) - } - } else { + // TODO: actually depends on level required or not + if (def % 2 === 0) { currentContainer.push(undefined) + } else { + currentContainer.push([]) } } } diff --git a/src/read.js b/src/read.js index 856a5e3..debde57 100644 --- a/src/read.js +++ b/src/read.js @@ -182,10 +182,14 @@ async function readRowGroup(options, rowGroup, groupStart) { const obj = {} for (let j = 0; j < keys[i].length; j++) { if (Array.isArray(keys[i][j])) { - // TODO: key should not be an array, this is an assemble bug + // TODO: key should not be an array, this is an assemble bug? keys[i][j] = keys[i][j][0] values[i][j] = values[i][j][0] } + if (keys[i][j] instanceof Uint8Array) { + // decode utf-8 keys + keys[i][j] = new TextDecoder().decode(keys[i][j]) + } if (!keys[i][j]) continue obj[keys[i][j]] = values[i][j] === undefined ? null : values[i][j] } diff --git a/test/assemble.test.js b/test/assemble.test.js index ae2a8eb..a817157 100644 --- a/test/assemble.test.js +++ b/test/assemble.test.js @@ -83,14 +83,14 @@ describe('assembleObjects', () => { expect(result).toEqual([[[]]]) }) - it('should handle isNull', () => { + it('should handle nonnullable lists', () => { // from nonnullable.impala.parquet const result = assembleObjects([2], [0], [-1], false, 2, 2) expect(result).toEqual([[[-1]]]) }) it('should handle nullable int_array', () => { - // from nullable.impala.parquet + // from nullable.impala.parquet int_array // [1 2 3][N 1 2 N 3 N][ ] N N const definitionLevels = [3, 3, 3, 2, 3, 3, 2, 3, 2, 1, 0, 0] const repetitionLevels = [0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0] @@ -104,4 +104,22 @@ describe('assembleObjects', () => { undefined, ]) }) + + it('should handle nullable int_array_Array', () => { + // from nullable.impala.parquet int_array_Array + // [1 2][3 4][[N 1 2 N][3 N 4] [] N][N] [] N N [N 5 6] + const definitionLevels = [5, 5, 5, 5, 4, 5, 5, 4, 5, 4, 5, 3, 2, 2, 1, 0, 0, 2, 5, 5] + const repetitionLevels = [0, 2, 1, 2, 0, 2, 2, 2, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 2] + const values = [1, 2, 3, 4, 1, 2, 3, 4, 5, 6] + const result = assembleObjects(definitionLevels, repetitionLevels, values, true, 5, 2) + expect(result).toEqual([ + [[1, 2], [3, 4]], + [[undefined, 1, 2, undefined], [3, undefined, 4], [], undefined], + [undefined], + [], + undefined, + undefined, + [undefined, [5, 6]], + ]) + }) })