Convert dictionary before dereferencing, and check encoding

This commit is contained in:
Kenny Daniel 2024-05-24 14:02:14 -07:00
parent f4877dcd49
commit 9aebdb2917
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
2 changed files with 36 additions and 32 deletions

@ -1,5 +1,5 @@
import { assembleLists } from './assemble.js'
import { convert, dereferenceDictionary } from './convert.js'
import { convertWithDictionary } from './convert.js'
import { readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
import { parquetHeader } from './header.js'
@ -49,11 +49,9 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
seen += daph.num_values
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))
// construct output values: skip nulls and construct lists
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
// convert types, dereference dictionary, and assemble lists
values = convertWithDictionary(dataPage, dictionary, element, daph.encoding, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
@ -79,10 +77,9 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
)
seen += daph2.num_values
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
// convert types, dereference dictionary, and assemble lists
values = convertWithDictionary(dataPage, dictionary, element, daph2.encoding, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)

@ -1,11 +1,40 @@
const dayMillis = 86400000 // 1 day in milliseconds
/**
 * Turn a decoded page into rich values, resolving dictionary indices when
 * the page is dictionary-encoded.
 *
 * When a dictionary is present and the encoding name ends with '_DICTIONARY',
 * the dictionary itself is converted first and each entry of `data` is then
 * used as an index into the converted dictionary. Otherwise `data` is
 * converted directly.
 *
 * @typedef {import('./types.js').DecodedArray} DecodedArray
 * @typedef {import('./types.js').SchemaElement} SchemaElement
 * @param {DecodedArray} data primitive values, or dictionary indices
 * @param {DecodedArray | undefined} dictionary dictionary page values, if any
 * @param {SchemaElement} schemaElement schema element passed through to convert
 * @param {import('./types.js').Encoding} encoding encoding of the data page
 * @param {boolean | undefined} utf8 decode bytes as utf8?
 * @returns {DecodedArray} series of rich types
 */
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
  // not a dictionary-encoded page: plain type conversion is all that is needed
  if (!dictionary || !encoding.endsWith('_DICTIONARY')) {
    return convert(data, schemaElement, utf8)
  }

  // convert the dictionary entries once, then look each index up in the result
  const lookup = convert(dictionary, schemaElement, utf8)

  // by default the looked-up values overwrite the indices in `data` itself
  let resolved = data
  if (data instanceof Uint8Array && !(lookup instanceof Uint8Array)) {
    // a Uint8Array cannot hold the dictionary's values, so allocate a
    // container of the same kind as the converted dictionary
    // @ts-expect-error upgrade data to match dictionary type with fancy constructor
    resolved = new lookup.constructor(data.length)
  }

  const count = data.length
  for (let index = 0; index < count; index++) {
    resolved[index] = lookup[data[index]]
  }
  return resolved
}
/**
* Convert known types from primitive to rich.
*
* @typedef {import('./types.js').DecodedArray} DecodedArray
* @param {DecodedArray} data series of primitive types
* @param {import('./types.js').SchemaElement} schemaElement schema element for the data
* @param {SchemaElement} schemaElement
* @param {boolean | undefined} utf8 decode bytes as utf8?
* @returns {DecodedArray} series of rich types
*/
@ -125,25 +154,3 @@ export function parseFloat16(bytes) {
if (exp === 0x1f) return frac ? NaN : sign * Infinity
return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
}
/**
 * Replace every index in a data page with the dictionary entry it refers to.
 *
 * The lookup results are written back into `dataPage` itself, except when
 * `dataPage` is a Uint8Array and the dictionary is not: then a new container
 * built from the dictionary's constructor is filled and returned instead.
 * Without a dictionary the data page is returned untouched.
 *
 * @param {DecodedArray | undefined} dictionary
 * @param {DecodedArray} dataPage
 * @returns {DecodedArray}
 */
export function dereferenceDictionary(dictionary, dataPage) {
  // no dictionary: the page already holds its final values
  if (!dictionary) return dataPage

  let resolved = dataPage
  if (dataPage instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
    // a Uint8Array cannot hold the dictionary's values, so switch to a
    // container of the same kind as the dictionary
    // @ts-expect-error not my fault typescript doesn't understand constructors
    resolved = new dictionary.constructor(dataPage.length)
  }

  const count = dataPage.length
  for (let index = 0; index < count; index++) {
    resolved[index] = dictionary[dataPage[index]]
  }
  return resolved
}