diff --git a/src/datapage.js b/src/datapage.js index 7cca676..7a0cbeb 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -1,4 +1,4 @@ -import { readData, readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' +import { readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js' import { getMaxDefinitionLevel, getMaxRepetitionLevel, isRequired, skipDefinitionBytes } from './schema.js' const skipNulls = false // TODO @@ -62,10 +62,8 @@ export function readDataPage(bytes, daph, schemaPath, columnMetadata) { reader.offset++ } if (bitWidth) { - const value = readRleBitPackedHybrid( - reader, bitWidth, view.byteLength - reader.offset, nValues - ) - values = Array.isArray(value) ? value : Array.from(value) + values = new Array(nValues) + readRleBitPackedHybrid(reader, bitWidth, view.byteLength - reader.offset, values) } else { // nval zeros values = new Array(nValues).fill(0) @@ -106,9 +104,9 @@ function readRepetitionLevels(reader, daph, schemaPath) { const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) if (maxRepetitionLevel) { const bitWidth = widthFromMaxInt(maxRepetitionLevel) - return readData( - reader, daph.repetition_level_encoding, daph.num_values, bitWidth - ) + const values = new Array(daph.num_values) + readRleBitPackedHybrid(reader, bitWidth, 0, values) + return values } } return [] @@ -128,9 +126,8 @@ function readDefinitionLevels(reader, daph, schemaPath) { const bitWidth = widthFromMaxInt(maxDefinitionLevel) if (bitWidth) { // num_values is index 1 for either type of page header - const definitionLevels = readData( - reader, daph.definition_level_encoding, daph.num_values, bitWidth - ) + const definitionLevels = new Array(daph.num_values) + readRleBitPackedHybrid(reader, bitWidth, 0, definitionLevels) // count nulls let numNulls = daph.num_values diff --git a/src/datapageV2.js b/src/datapageV2.js index 84f5357..fc38f0c 100644 --- a/src/datapageV2.js +++ b/src/datapageV2.js @@ -64,9 +64,8 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, throw new Error('parquet RLE encoding with nulls not supported') } else { const pageReader = { view: pageView, offset: 4 } - values = readRleBitPackedHybrid( - pageReader, bitWidth, uncompressedPageSize, nValues - ) + values = new Array(nValues) + readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, values) } } else if ( daph2.encoding === 'PLAIN_DICTIONARY' || @@ -77,10 +76,8 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) const bitWidth = pageView.getUint8(0) const pageReader = { view: pageView, offset: 1 } - const value = readRleBitPackedHybrid( - pageReader, bitWidth, uncompressedPageSize, nValues - ) - values = value + values = new Array(nValues) + readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize, values) } else if (daph2.encoding === 'DELTA_BINARY_PACKED') { if (daph2.num_nulls) throw new Error('parquet delta-int not supported') const codec = daph2.is_compressed ? columnMetadata.codec : 'UNCOMPRESSED' @@ -108,9 +105,11 @@ export function readRepetitionLevelsV2(reader, daph2, schemaPath) { const bitWidth = widthFromMaxInt(maxRepetitionLevel) // num_values is index 1 for either type of page header - return readRleBitPackedHybrid( - reader, bitWidth, daph2.repetition_levels_byte_length, daph2.num_values + const values = new Array(daph2.num_values) + readRleBitPackedHybrid( + reader, bitWidth, daph2.repetition_levels_byte_length, values ) + return values } /** @@ -125,9 +124,9 @@ function readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel) { if (maxDefinitionLevel) { // not the same as V1, because we know the length const bitWidth = widthFromMaxInt(maxDefinitionLevel) - return readRleBitPackedHybrid( - reader, bitWidth, daph2.definition_levels_byte_length, daph2.num_values - ) + const values = new Array(daph2.num_values) + readRleBitPackedHybrid(reader, bitWidth, daph2.definition_levels_byte_length, values) + return values } } diff --git a/src/encoding.js b/src/encoding.js index 3cc0eac..bc5793c 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -185,33 +185,6 @@ export function widthFromMaxInt(value) { return Math.ceil(Math.log2(value + 1)) } -/** - * Read data from the file-object using the given encoding. - * The data could be definition levels, repetition levels, or actual values. - * - * @typedef {import("./types.d.ts").Encoding} Encoding - * @param {DataReader} reader - buffer to read data from - * @param {Encoding} encoding - encoding type - * @param {number} count - number of values to read - * @param {number} bitWidth - width of each bit-packed group - * @returns {any[]} array of values - */ -export function readData(reader, encoding, count, bitWidth) { - const values = new Array(count) - if (encoding === 'RLE') { - let seen = 0 - while (seen < count) { - const rle = readRleBitPackedHybrid(reader, bitWidth, 0, count) - if (!rle.length) break // EOF - splice(values, rle, seen) - seen += rle.length - } - } else { - throw new Error(`parquet encoding not supported ${encoding}`) - } - return values -} - /** * Read values from a run-length encoded/bit-packed hybrid encoding. * @@ -221,20 +194,17 @@ export function readData(reader, encoding, count, bitWidth) { * @param {DataReader} reader - buffer to read data from * @param {number} width - width of each bit-packed group * @param {number} length - length of the encoded data - * @param {number} numValues - number of values to read - * @returns {number[]} array of rle/bit-packed values + * @param {number[]} values - output array */ -export function readRleBitPackedHybrid(reader, width, length, numValues) { +export function readRleBitPackedHybrid(reader, width, length, values) { if (!length) { length = reader.view.getInt32(reader.offset, true) reader.offset += 4 if (length < 0) throw new Error(`parquet invalid rle/bitpack length ${length}`) } - /** @type {number[]} */ - const values = new Array(numValues) let seen = 0 const startOffset = reader.offset - while (reader.offset - startOffset < length && seen < numValues) { + while (reader.offset - startOffset < length && seen < values.length) { const [header, newOffset] = readVarInt(reader.view, reader.offset) reader.offset = newOffset if ((header & 1) === 0) { @@ -244,15 +214,11 @@ export function readRleBitPackedHybrid(reader, width, length, numValues) { seen += rle.length } else { // bit-packed - const bitPacked = readBitPacked( - reader, header, width, numValues - seen - ) + const bitPacked = readBitPacked(reader, header, width, values.length - seen) splice(values, bitPacked, seen) seen += bitPacked.length } } - - return values } /** diff --git a/test/encoding.test.js b/test/encoding.test.js index a4ed3bf..dede011 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -111,9 +111,10 @@ describe('readRleBitPackedHybrid', () => { view.setUint8(3, 0b00000100) // Bit-packed values (false, false, true) const reader = { view, offset: 0 } - const value = readRleBitPackedHybrid(reader, 1, 3, 6) + const values = new Array(6) + readRleBitPackedHybrid(reader, 1, 3, values) expect(reader.offset).toBe(4) - expect(value).toEqual([1, 1, 1, 0, 0, 1]) + expect(values).toEqual([1, 1, 1, 0, 0, 1]) }) it('reads RLE bit-packed hybrid values with implicit length', () => { @@ -127,8 +128,9 @@ describe('readRleBitPackedHybrid', () => { view.setUint8(7, 0b00000100) // Bit-packed values (false, false, true) const reader = { view, offset: 0 } - const value = readRleBitPackedHybrid(reader, 1, 0, 6) + const values = new Array(6) + readRleBitPackedHybrid(reader, 1, 0, values) expect(reader.offset).toBe(8) - expect(value).toEqual([1, 1, 1, 0, 0, 1]) + expect(values).toEqual([1, 1, 1, 0, 0, 1]) }) })