diff --git a/src/column.js b/src/column.js index 83a8535..b909a88 100644 --- a/src/column.js +++ b/src/column.js @@ -27,6 +27,7 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) { // parse column data let valuesSeen = 0 let byteOffset = 0 // byteOffset within the column + /** @type {ArrayLike | undefined} */ let dictionary = undefined const rowIndex = [0] // map/list object index const rowData = [] @@ -65,18 +66,23 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) { const { definitionLevels, repetitionLevels, value: dataPage } = readDataPage(page, daph, schema, columnMetadata) valuesSeen += daph.num_values + const dictionaryEncoding = daph.encoding === Encoding.PLAIN_DICTIONARY || daph.encoding === Encoding.RLE_DICTIONARY + // construct output values: skip nulls and construct lists let values if (repetitionLevels.length) { // Use repetition levels to construct lists - if ([Encoding.PLAIN_DICTIONARY, Encoding.RLE_DICTIONARY].includes(daph.encoding)) { - // TODO: dereference dictionary values + if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) { + // dereference dictionary values + for (let i = 0; i < dataPage.length; i++) { + dataPage[i] = dictionary[dataPage[i]] + } } const isNull = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]]) const nullValue = false // TODO: unused? const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) values = assembleObjects(definitionLevels, repetitionLevels, dataPage, isNull, nullValue, maxDefinitionLevel, rowIndex[0]) - } else if (definitionLevels) { + } else if (definitionLevels?.length) { const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) // Use definition levels to skip nulls let index = 0 @@ -105,7 +111,12 @@ export function readColumn(arrayBuffer, rowGroup, columnMetadata, schema) { } } } else { - // TODO: use dictionary + if (dictionaryEncoding && dictionary !== undefined && Array.isArray(dataPage)) { + // dereference dictionary values + for (let i = 0; i < dataPage.length; i++) { + dataPage[i] = dictionary[dataPage[i]] + } + } values = dataPage } diff --git a/src/datapage.js b/src/datapage.js index eb1c91b..9867b34 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -23,7 +23,7 @@ const skipNulls = false // TODO * @param {DataPageHeader} daph data page header * @param {SchemaElement[]} schema schema for the file * @param {ColumnMetaData} columnMetadata metadata for the column - * @returns {DataPage} array of values + * @returns {DataPage} definition levels, repetition levels, and array of values */ export function readDataPage(bytes, daph, schema, columnMetadata) { const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) @@ -142,8 +142,16 @@ function readDefinitionLevels(dataView, offset, daph, schema, columnMetadata) { const { value: definitionLevels, byteLength } = readData( dataView, daph.definition_level_encoding, offset, daph.num_values, bitWidth ) - const numNulls = daph.num_values - definitionLevels - .filter((/** @type number */ d) => d === maxDefinitionLevel).length + + // count nulls + let numNulls = daph.num_values + for (const def of definitionLevels) { + if (def === maxDefinitionLevel) numNulls-- + } + if (numNulls === 0) { + definitionLevels.length = 0 + } + return { byteLength, definitionLevels, numNulls } } } diff --git a/src/encoding.js b/src/encoding.js index 93aae97..948a96a 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -206,7 +206,7 @@ export function readData(dataView, encoding, offset, count, bitWidth) { if (encoding === ParquetEncoding.RLE) { let seen = 0 while (seen < count) { - const { value: rleValues, byteLength: rleByteLength } = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count) + const { value: rleValues, byteLength: rleByteLength } = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, 1) if (!rleValues.length) break // EOF value.push(...rleValues) seen += rleValues.length @@ -220,13 +220,13 @@ export function readData(dataView, encoding, offset, count, bitWidth) { /** * Read values from a run-length encoded/bit-packed hybrid encoding. - * If length is not specified, then a 32-bit int is read first to grab the - * length of the encoded data. + * + * If length is zero, then read as int32 at the start of the encoded data. * * @param {DataView} dataView - buffer to read data from * @param {number} offset - offset to start reading from the DataView * @param {number} width - width of each bit-packed group - * @param {number | undefined} length - length of the encoded data + * @param {number} length - length of the encoded data * @param {number} numValues - number of values to read * @returns {Decoded} array of rle/bit-packed values */ diff --git a/src/read.js b/src/read.js index d4648ec..47780ff 100644 --- a/src/read.js +++ b/src/read.js @@ -113,11 +113,13 @@ async function readRowGroup(options, rowGroup) { const columnStartByte = getColumnOffset(columnMetadata) const columnEndByte = columnStartByte + Number(columnMetadata.total_compressed_size) const columnBytes = columnEndByte - columnStartByte + // skip columns larger than 1gb if (columnBytes > 1 << 30) { console.warn(`parquet skipping huge column "${columnMetadata.path_in_schema}" ${columnBytes.toLocaleString()} bytes`) continue } + // use pre-loaded row group byte data if available, else read column data let buffer if (!groupBuffer) { @@ -127,6 +129,7 @@ async function readRowGroup(options, rowGroup) { } else { buffer = Promise.resolve(groupBuffer) } + // read column data async promises.push(buffer.then(arrayBuffer => { // TODO: extract SchemaElement for this column diff --git a/test/encoding.test.js b/test/encoding.test.js index c2c0d29..b89d14b 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -117,7 +117,7 @@ describe('readRleBitPackedHybrid', () => { dataView.setUint8(6, 0b00000011) // Bit-packed header for 3 values dataView.setUint8(7, 0b00000100) // Bit-packed values (false, false, true) - const { byteLength, value } = readRleBitPackedHybrid(dataView, 0, 1, undefined, 6) + const { byteLength, value } = readRleBitPackedHybrid(dataView, 0, 1, 0, 6) expect(byteLength).toBe(8) expect(value).toEqual([1, 1, 1, 0, 0, 1]) })