Split out assemble objects

This commit is contained in:
Kenny Daniel 2024-03-18 16:36:16 -07:00
parent 4654c5eddf
commit 52721a3d30
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
5 changed files with 116 additions and 74 deletions

@ -27,15 +27,15 @@
"typecheck": "tsc"
},
"devDependencies": {
"@types/node": "20.11.27",
"@typescript-eslint/eslint-plugin": "7.2.0",
"@vitest/coverage-v8": "1.3.1",
"@types/node": "20.11.29",
"@typescript-eslint/eslint-plugin": "7.3.1",
"@vitest/coverage-v8": "1.4.0",
"eslint": "8.57.0",
"eslint-plugin-import": "2.29.1",
"eslint-plugin-jsdoc": "48.2.1",
"http-server": "14.1.1",
"hysnappy": "0.3.0",
"typescript": "5.4.2",
"vitest": "1.3.1"
"vitest": "1.4.0"
}
}

60
src/assemble.js Normal file

@ -0,0 +1,60 @@
/**
 * Dremel-assembly of arrays of values into lists
 *
 * Walks the levels of a single page. A falsy repetition level starts a new
 * row; any other repetition level appends to the row in progress. Entries
 * that appear before the first row start belong to a row continued from the
 * previous page. That row is not visible here, so those entries are dropped
 * (their values are still consumed so later rows stay aligned).
 *
 * @param {number[] | undefined} definitionLevels definition levels, max 3
 *   (when missing or empty, every entry is treated as fully defined)
 * @param {number[]} repetitionLevels repetition levels, max 1
 * @param {ArrayLike<any>} value values to process
 * @param {boolean} isNull can an entry be null?
 * @param {number} maxDefinitionLevel definition level that corresponds to non-null
 * @returns {any[]} array of values: one list per row, or undefined for a null row
 */
export function assembleObjects(
  definitionLevels, repetitionLevels, value, isNull, maxDefinitionLevel
) {
  let vali = 0 // index of the next unread entry in value
  let started = false // true once the first row start in this page was seen
  let haveNull = false // true when the most recent entry marks a null row
  let part = [] // elements collected for the row in progress
  /** @type {any[]} */
  const assign = []
  for (let counter = 0; counter < repetitionLevels.length; counter++) {
    const def = definitionLevels?.length ? definitionLevels[counter] : maxDefinitionLevel
    const rep = repetitionLevels[counter]
    if (!rep) {
      // new row - save what we have
      if (started) {
        assign.push(haveNull ? undefined : part)
      }
      // On the first row start, anything collected so far is the tail of a
      // row continued from the previous page. Discard it unconditionally so
      // it can neither leak into this row nor be written to index -1.
      part = []
      started = true
    }
    if (def === maxDefinitionLevel) {
      // append real value to current item
      part.push(value[vali])
      vali++
    } else if (def > 0) {
      // append null to current item
      part.push(undefined)
    }
    // only the last entry of a row decides whether the whole row is null
    haveNull = def === 0 && isNull
  }
  if (started) {
    // flush the final row of the page
    assign.push(haveNull ? undefined : part)
  }
  return assign
}

@ -1,6 +1,7 @@
import { assembleObjects } from './assemble.js'
import { PageType } from './constants.js'
import { convert } from './convert.js'
import { assembleObjects, readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
import { parquetHeader } from './header.js'
import { getMaxDefinitionLevel, isRequired, schemaElement } from './schema.js'
@ -29,7 +30,6 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
let dictionary = undefined
let valuesSeen = 0
let byteOffset = 0 // byteOffset within the column
const rowIndex = [0] // map/list object index
const rowData = []
while (valuesSeen < rowGroup.num_rows) {
@ -66,10 +66,9 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
dereferenceDictionary(dictionary, dataPage)
// Use repetition levels to construct lists
const isNull = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]])
const nullValue = false // TODO: unused?
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)
values = assembleObjects(
definitionLevels, repetitionLevels, dataPage, isNull, nullValue, maxDefinitionLevel, rowIndex[0]
definitionLevels, repetitionLevels, dataPage, isNull, maxDefinitionLevel
)
} else if (definitionLevels?.length) {
const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)
@ -115,7 +114,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
dereferenceDictionary(dictionary, dataPage)
// Use repetition levels to construct lists
rowData.push(...assembleObjects(
definitionLevels, repetitionLevels, dataPage, true, false, maxDefinitionLevel, rowIndex[0]
definitionLevels, repetitionLevels, dataPage, true, maxDefinitionLevel
))
} else if (daph2.num_nulls) {
// skip nulls

@ -168,68 +168,3 @@ function readDefinitionLevels(dataView, offset, daph, schema, path_in_schema) {
}
return { byteLength: 0, definitionLevels: [], numNulls: 0 }
}
/**
 * Dremel-assembly of arrays of values into lists
 *
 * Scans the levels of one page and groups values into rows. A falsy
 * repetition level opens a new row; other levels extend the open row.
 * Rows are written into the result starting at index prevIndex.
 *
 * @param {number[] | undefined} definitionLevels definition levels, max 3
 * @param {number[]} repetitionLevels repetition levels, max 1
 * @param {ArrayLike<any>} value values to process
 * @param {boolean} isNull can an entry be null?
 * @param {boolean} nullValue can list elements be null? (not read by this function)
 * @param {number} maxDefinitionLevel definition level that corresponds to non-null
 * @param {number} prevIndex 1 + index where the last row in the previous page was inserted (0 if first page)
 * @returns {any[]} array of values
 */
export function assembleObjects(
  definitionLevels, repetitionLevels, value, isNull, nullValue, maxDefinitionLevel, prevIndex
) {
  /** @type {any[]} */
  const rows = []
  let row = []
  let rowIndex = prevIndex
  let valueIndex = 0
  let rowOpen = false
  let rowIsNull = false

  for (let pos = 0; pos < repetitionLevels.length; pos++) {
    const level = definitionLevels?.length ? definitionLevels[pos] : maxDefinitionLevel
    const startsRow = !repetitionLevels[pos]

    if (startsRow && rowOpen) {
      // close the row in progress and move on to the next slot
      rows[rowIndex] = rowIsNull ? undefined : row
      rowIndex++
      row = []
    } else if (startsRow) {
      // first row start of the page
      if (valueIndex > 0) {
        // values were read before any row start: treat them as the tail of
        // the previous page's row and merge into slot rowIndex - 1
        const previous = rows[rowIndex - 1]
        rows[rowIndex - 1] = previous?.concat(row)
        row = []
      }
      rowOpen = true
    }

    if (level === maxDefinitionLevel) {
      // fully defined: take the next value
      row.push(value[valueIndex])
      valueIndex++
    } else if (level > 0) {
      // partially defined: null element inside the list
      row.push(undefined)
    }
    rowIsNull = level === 0 && isNull
  }

  if (rowOpen) {
    rows[rowIndex] = rowIsNull ? undefined : row
  } else if (valueIndex > 0) {
    // the whole page continued the previous page's row
    rows[rowIndex - 1] = rows[rowIndex - 1]?.concat(row)
  }
  return rows
}

48
test/assemble.test.js Normal file

@ -0,0 +1,48 @@
import { describe, expect, it } from 'vitest'
import { assembleObjects } from '../src/assemble.js'
// Unit tests for assembleObjects; every case uses a max definition level of 3.
describe('assembleObjects', () => {
  const maxDefinitionLevel = 3

  it('should assemble objects with non-null values', () => {
    // empty definition levels: every entry counts as fully defined
    expect(
      assembleObjects([], [0, 1], ['a', 'b'], false, maxDefinitionLevel)
    ).toEqual([['a', 'b']])
  })

  it('should handle null values', () => {
    expect(
      assembleObjects([3, 0, 3], [0, 0, 1], ['a', 'c'], true, maxDefinitionLevel)
    ).toEqual([['a'], ['c']])
  })

  it('should handle empty lists', () => {
    expect(assembleObjects([], [], [], false, maxDefinitionLevel)).toEqual([])
  })

  it('should handle multiple lists', () => {
    // repetition level 0 marks the start of each of the two rows
    const assembled = assembleObjects(
      [3, 3, 3, 3, 3, 3], [0, 1, 1, 0, 1, 1], [1, 2, 3, 4, 5, 6], false, maxDefinitionLevel
    )
    expect(assembled).toEqual([[1, 2, 3], [4, 5, 6]])
  })

  it('should assemble multiple lists with nulls', () => {
    // the lone definition level 0 becomes an undefined row
    const assembled = assembleObjects(
      [3, 3, 0, 3, 3], [0, 1, 0, 0, 1], ['a', 'b', 'd', 'e'], true, maxDefinitionLevel
    )
    expect(assembled).toEqual([['a', 'b'], undefined, ['d', 'e']])
  })

  it('should handle continuing a row from the previous page', () => {
    // the page opens with repetition level 1, i.e. mid-row; 'a' is not in the output
    const assembled = assembleObjects(
      [3, 3, 3, 1], [1, 0, 1, 0], ['a', 'b', 'c', 'd'], false, maxDefinitionLevel
    )
    expect(assembled).toEqual([['b', 'c'], [undefined]])
  })
})