From 52721a3d3014465c71f0bbb46bf21b2624bccad2 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Mon, 18 Mar 2024 16:36:16 -0700 Subject: [PATCH] Split out assemble objects --- package.json | 8 +++--- src/assemble.js | 60 +++++++++++++++++++++++++++++++++++++++ src/column.js | 9 +++--- src/datapage.js | 65 ------------------------------------------- test/assemble.test.js | 48 ++++++++++++++++++++++++++++++++ 5 files changed, 116 insertions(+), 74 deletions(-) create mode 100644 src/assemble.js create mode 100644 test/assemble.test.js diff --git a/package.json b/package.json index 14009e1..de7dfb4 100644 --- a/package.json +++ b/package.json @@ -27,15 +27,15 @@ "typecheck": "tsc" }, "devDependencies": { - "@types/node": "20.11.27", - "@typescript-eslint/eslint-plugin": "7.2.0", - "@vitest/coverage-v8": "1.3.1", + "@types/node": "20.11.29", + "@typescript-eslint/eslint-plugin": "7.3.1", + "@vitest/coverage-v8": "1.4.0", "eslint": "8.57.0", "eslint-plugin-import": "2.29.1", "eslint-plugin-jsdoc": "48.2.1", "http-server": "14.1.1", "hysnappy": "0.3.0", "typescript": "5.4.2", - "vitest": "1.3.1" + "vitest": "1.4.0" } } diff --git a/src/assemble.js b/src/assemble.js new file mode 100644 index 0000000..0892716 --- /dev/null +++ b/src/assemble.js @@ -0,0 +1,60 @@ +/** + * Dremel-assembly of arrays of values into lists + * + * @param {number[] | undefined} definitionLevels definition levels, max 3 + * @param {number[]} repetitionLevels repetition levels, max 1 + * @param {ArrayLike} value values to process + * @param {boolean} isNull can an entry be null? + * @param {number} maxDefinitionLevel definition level that corresponds to non-null + * @returns {any[]} array of values + */ +export function assembleObjects( + definitionLevels, repetitionLevels, value, isNull, maxDefinitionLevel +) { + let vali = 0 + let started = false + let haveNull = false + let i = 0 + let part = [] + /** @type {any[]} */ + const assign = [] + + for (let counter = 0; counter < repetitionLevels.length; counter++) { + const def = definitionLevels?.length ? definitionLevels[counter] : maxDefinitionLevel + const rep = repetitionLevels[counter] + + if (!rep) { + // new row - save what we have + if (started) { + assign[i] = haveNull ? undefined : part + part = [] + i++ + } else { + // first time: no row to save yet, unless it's a row continued from previous page + if (vali > 0) { + assign[i - 1] = assign[i - 1]?.concat(part) // add items to previous row + part = [] + // don't increment i since we only filled i-1 + } + started = true + } + } + + if (def === maxDefinitionLevel) { + // append real value to current item + part.push(value[vali]) + vali++ + } else if (def > 0) { + // append null to current item + part.push(undefined) + } + + haveNull = def === 0 && isNull + } + + if (started) { + assign[i] = haveNull ? undefined : part + } + + return assign +} diff --git a/src/column.js b/src/column.js index a914408..b4af997 100644 --- a/src/column.js +++ b/src/column.js @@ -1,6 +1,7 @@ +import { assembleObjects } from './assemble.js' import { PageType } from './constants.js' import { convert } from './convert.js' -import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js' +import { readDataPage, readDictionaryPage } from './datapage.js' import { readDataPageV2 } from './datapageV2.js' import { parquetHeader } from './header.js' import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js' @@ -29,7 +30,6 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, let dictionary = undefined let valuesSeen = 0 let byteOffset = 0 // byteOffset within the column - const rowIndex = [0] // map/list object index const rowData = [] while (valuesSeen < rowGroup.num_rows) { @@ -66,10 +66,9 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, dereferenceDictionary(dictionary, dataPage) // Use repetition levels to construct lists const isNull = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]]) - const nullValue = false // TODO: unused? const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) values = assembleObjects( - definitionLevels, repetitionLevels, dataPage, isNull, nullValue, maxDefinitionLevel, rowIndex[0] + definitionLevels, repetitionLevels, dataPage, isNull, maxDefinitionLevel ) } else if (definitionLevels?.length) { const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) @@ -115,7 +114,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, dereferenceDictionary(dictionary, dataPage) // Use repetition levels to construct lists rowData.push(...assembleObjects( - definitionLevels, repetitionLevels, dataPage, true, false, maxDefinitionLevel, rowIndex[0] + definitionLevels, repetitionLevels, dataPage, true, maxDefinitionLevel )) } else if (daph2.num_nulls) { // skip nulls diff --git a/src/datapage.js b/src/datapage.js index 4613b22..43f8ba2 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -168,68 +168,3 @@ function readDefinitionLevels(dataView, offset, daph, schema, path_in_schema) { } return { byteLength: 0, definitionLevels: [], numNulls: 0 } } - -/** - * Dremel-assembly of arrays of values into lists - * - * @param {number[] | undefined} definitionLevels definition levels, max 3 - * @param {number[]} repetitionLevels repetition levels, max 1 - * @param {ArrayLike} value values to process - * @param {boolean} isNull can an entry be null? - * @param {boolean} nullValue can list elements be null? - * @param {number} maxDefinitionLevel definition level that corresponds to non-null - * @param {number} prevIndex 1 + index where the last row in the previous page was inserted (0 if first page) - * @returns {any[]} array of values - */ -export function assembleObjects( - definitionLevels, repetitionLevels, value, isNull, nullValue, maxDefinitionLevel, prevIndex -) { - let vali = 0 - let started = false - let haveNull = false - let i = prevIndex - let part = [] - /** @type {any[]} */ - const assign = [] - - for (let counter = 0; counter < repetitionLevels.length; counter++) { - const def = definitionLevels?.length ? definitionLevels[counter] : maxDefinitionLevel - const rep = repetitionLevels[counter] - - if (!rep) { - // new row - save what we have - if (started) { - assign[i] = haveNull ? undefined : part - part = [] - i++ - } else { - // first time: no row to save yet, unless it's a row continued from previous page - if (vali > 0) { - assign[i - 1] = assign[i - 1]?.concat(part) // add items to previous row - part = [] - // don't increment i since we only filled i-1 - } - started = true - } - } - - if (def === maxDefinitionLevel) { - // append real value to current item - part.push(value[vali]) - vali++ - } else if (def > 0) { - // append null to current item - part.push(undefined) - } - - haveNull = def === 0 && isNull - } - - if (started) { - assign[i] = haveNull ? undefined : part - } else if (vali > 0) { - assign[i - 1] = assign[i - 1]?.concat(part) - } - - return assign -} diff --git a/test/assemble.test.js b/test/assemble.test.js new file mode 100644 index 0000000..0a7fd2f --- /dev/null +++ b/test/assemble.test.js @@ -0,0 +1,48 @@ +import { describe, expect, it } from 'vitest' +import { assembleObjects } from '../src/assemble.js' + +describe('assembleObjects', () => { + it('should assemble objects with non-null values', () => { + const repetitionLevels = [0, 1] + const values = ['a', 'b'] + const result = assembleObjects([], repetitionLevels, values, false, 3) + expect(result).toEqual([['a', 'b']]) + }) + + it('should handle null values', () => { + const definitionLevels = [3, 0, 3] + const repetitionLevels = [0, 0, 1] + const values = ['a', 'c'] + const result = assembleObjects(definitionLevels, repetitionLevels, values, true, 3) + expect(result).toEqual([['a'], ['c']]) + }) + + it('should handle empty lists', () => { + const result = assembleObjects([], [], [], false, 3) + expect(result).toEqual([]) + }) + + it('should handle multiple lists', () => { + const definitionLevels = [3, 3, 3, 3, 3, 3] + const repetitionLevels = [0, 1, 1, 0, 1, 1] + const values = [1, 2, 3, 4, 5, 6] + const result = assembleObjects(definitionLevels, repetitionLevels, values, false, 3) + expect(result).toEqual([[1, 2, 3], [4, 5, 6]]) + }) + + it('should assemble multiple lists with nulls', () => { + const definitionLevels = [3, 3, 0, 3, 3] + const repetitionLevels = [0, 1, 0, 0, 1] + const values = ['a', 'b', 'd', 'e'] + const result = assembleObjects(definitionLevels, repetitionLevels, values, true, 3) + expect(result).toEqual([['a', 'b'], undefined, ['d', 'e']]) + }) + + it('should handle continuing a row from the previous page', () => { + const definitionLevels = [3, 3, 3, 1] + const repetitionLevels = [1, 0, 1, 0] + const values = ['a', 'b', 'c', 'd'] + const result = assembleObjects(definitionLevels, repetitionLevels, values, false, 3) + expect(result).toEqual([['b', 'c'], [undefined]]) + }) +})