mirror of
https://github.com/asadbek064/hyparquet.git
synced 2026-01-01 01:36:38 +00:00
Convert dictionary before dereferencing, and check encoding
This commit is contained in:
parent
f4877dcd49
commit
9aebdb2917
@ -1,5 +1,5 @@
|
||||
import { assembleLists } from './assemble.js'
|
||||
import { convert, dereferenceDictionary } from './convert.js'
|
||||
import { convertWithDictionary } from './convert.js'
|
||||
import { readDataPage, readDictionaryPage } from './datapage.js'
|
||||
import { readDataPageV2 } from './datapageV2.js'
|
||||
import { parquetHeader } from './header.js'
|
||||
@ -49,11 +49,9 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
|
||||
seen += daph.num_values
|
||||
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))
|
||||
|
||||
// construct output values: skip nulls and construct lists
|
||||
values = dereferenceDictionary(dictionary, dataPage)
|
||||
values = convert(values, element, utf8)
|
||||
// convert types, dereference dictionary, and assemble lists
|
||||
values = convertWithDictionary(dataPage, dictionary, element, daph.encoding, utf8)
|
||||
if (repetitionLevels.length || definitionLevels?.length) {
|
||||
// Use repetition levels to construct lists
|
||||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
|
||||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
|
||||
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
|
||||
@ -79,10 +77,9 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
|
||||
)
|
||||
seen += daph2.num_values
|
||||
|
||||
values = dereferenceDictionary(dictionary, dataPage)
|
||||
values = convert(values, element, utf8)
|
||||
// convert types, dereference dictionary, and assemble lists
|
||||
values = convertWithDictionary(dataPage, dictionary, element, daph2.encoding, utf8)
|
||||
if (repetitionLevels.length || definitionLevels?.length) {
|
||||
// Use repetition levels to construct lists
|
||||
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
|
||||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
|
||||
const repetitionPath = schemaPath.map(({ element }) => element.repetition_type)
|
||||
|
||||
@ -1,11 +1,40 @@
|
||||
const dayMillis = 86400000 // 1 day in milliseconds
|
||||
|
||||
/**
 * Convert known types from primitive to rich, and dereference dictionary.
 *
 * If a dictionary is given and the page encoding name ends with
 * `_DICTIONARY`, `data` is treated as indices: the dictionary is converted
 * first and every index is then replaced by its converted dictionary entry.
 * Otherwise `data` is converted directly.
 *
 * The index array is reused as the output when it can hold the dictionary
 * values (a plain array, or a typed array of the same kind as the converted
 * dictionary). A typed index array of a different kind is never written in
 * place, because assigning into it would coerce the dictionary values to its
 * element type; a new array of the dictionary's type is allocated instead.
 *
 * @typedef {import('./types.js').DecodedArray} DecodedArray
 * @typedef {import('./types.js').SchemaElement} SchemaElement
 * @param {DecodedArray} data series of primitive types, or dictionary indices
 * @param {DecodedArray | undefined} dictionary
 * @param {SchemaElement} schemaElement
 * @param {import('./types.js').Encoding} encoding
 * @param {boolean | undefined} utf8 decode bytes as utf8?
 * @returns {DecodedArray} series of rich types
 */
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
  if (dictionary && encoding.endsWith('_DICTIONARY')) {
    // convert dictionary before dereferencing, so each entry is converted once
    const converted = convert(dictionary, schemaElement, utf8)
    let output = data
    if (ArrayBuffer.isView(data) && !(converted instanceof data.constructor)) {
      // typed index array cannot store the dictionary's values without coercion
      // @ts-expect-error upgrade data to match dictionary type with fancy constructor
      output = new converted.constructor(data.length)
    }
    for (let i = 0; i < data.length; i++) {
      output[i] = converted[data[i]]
    }
    return output
  } else {
    return convert(data, schemaElement, utf8)
  }
}
|
||||
|
||||
/**
|
||||
* Convert known types from primitive to rich.
|
||||
*
|
||||
* @typedef {import('./types.js').DecodedArray} DecodedArray
|
||||
* @param {DecodedArray} data series of primitive types
|
||||
* @param {import('./types.js').SchemaElement} schemaElement schema element for the data
|
||||
* @param {SchemaElement} schemaElement
|
||||
* @param {boolean | undefined} utf8 decode bytes as utf8?
|
||||
* @returns {DecodedArray} series of rich types
|
||||
*/
|
||||
@ -125,25 +154,3 @@ export function parseFloat16(bytes) {
|
||||
if (exp === 0x1f) return frac ? NaN : sign * Infinity
|
||||
return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
|
||||
}
|
||||
|
||||
/**
 * Map data to dictionary values.
 *
 * Without a dictionary the data page is returned untouched. With one, every
 * entry of the data page is treated as an index and replaced by the
 * dictionary value at that index.
 *
 * The data page is overwritten in place when it can hold the dictionary
 * values (a plain array, or a typed array of the same kind as the
 * dictionary). A typed data page of a different kind is left unmodified and
 * a new array of the dictionary's type is returned, because writing into it
 * would coerce the dictionary values to its element type.
 *
 * @param {DecodedArray | undefined} dictionary
 * @param {DecodedArray} dataPage
 * @returns {DecodedArray}
 */
export function dereferenceDictionary(dictionary, dataPage) {
  if (!dictionary) return dataPage

  let output = dataPage
  if (ArrayBuffer.isView(dataPage) && !(dictionary instanceof dataPage.constructor)) {
    // upgrade dataPage to match dictionary type
    // @ts-expect-error not my fault typescript doesn't understand constructors
    output = new dictionary.constructor(dataPage.length)
  }
  for (let i = 0; i < dataPage.length; i++) {
    output[i] = dictionary[dataPage[i]]
  }
  return output
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user