From f826bff75791ced10d593d55496b8e5871d50db3 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Wed, 17 Apr 2024 00:48:33 -0700 Subject: [PATCH] Use DataReader over Decoded. Fewer allocations, slightly faster. --- src/datapage.js | 64 +++++------ src/datapageV2.js | 61 +++++------ src/encoding.js | 246 +++++++++++++++++++----------------------- src/schema.js | 6 +- src/snappy.js | 6 +- src/types.d.ts | 5 + test/encoding.test.js | 139 +++++++++++++----------- 7 files changed, 256 insertions(+), 271 deletions(-) diff --git a/src/datapage.js b/src/datapage.js index 43f8ba2..6922f45 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -10,17 +10,13 @@ import { const skipNulls = false // TODO /** - * @typedef {{ byteLength: number, definitionLevels: number[], numNulls: number }} DefinitionLevels + * @typedef {{ definitionLevels: number[], numNulls: number }} DefinitionLevels * @typedef {import("./types.d.ts").DataPage} DataPage * @typedef {import("./types.d.ts").ColumnMetaData} ColumnMetaData * @typedef {import("./types.d.ts").DataPageHeader} DataPageHeader * @typedef {import("./types.d.ts").DictionaryPageHeader} DictionaryPageHeader * @typedef {import("./types.d.ts").SchemaElement} SchemaElement */ -/** - * @typedef {import("./types.d.ts").Decoded} Decoded - * @template T - */ /** * Read a data page from the given Uint8Array. @@ -32,16 +28,15 @@ const skipNulls = false // TODO * @returns {DataPage} definition levels, repetition levels, and array of values */ export function readDataPage(bytes, daph, schema, columnMetadata) { - const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) - let offset = 0 + const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) + const reader = { view, offset: 0 } /** @type {any[]} */ let values = [] // repetition levels - const { value: repetitionLevels, byteLength } = readRepetitionLevels( - dataView, offset, daph, schema, columnMetadata + const repetitionLevels = readRepetitionLevels( + reader, daph, schema, columnMetadata ) - offset += byteLength // definition levels let definitionLevels = undefined @@ -50,12 +45,11 @@ export function readDataPage(bytes, daph, schema, columnMetadata) { // TODO: move into readDefinitionLevels if (skipNulls && !isRequired(schema, columnMetadata.path_in_schema)) { // skip_definition_bytes - offset += skipDefinitionBytes(daph.num_values) + reader.offset += skipDefinitionBytes(daph.num_values) } else { - const dl = readDefinitionLevels(dataView, offset, daph, schema, columnMetadata.path_in_schema) + const dl = readDefinitionLevels(reader, daph, schema, columnMetadata.path_in_schema) definitionLevels = dl.definitionLevels numNulls = dl.numNulls - offset += dl.byteLength } // read values based on encoding @@ -63,9 +57,8 @@ export function readDataPage(bytes, daph, schema, columnMetadata) { if (daph.encoding === 'PLAIN') { const { element } = schemaElement(schema, columnMetadata.path_in_schema) const utf8 = element.converted_type === 'UTF8' - const plainObj = readPlain(dataView, columnMetadata.type, nValues, offset, utf8) - values = Array.isArray(plainObj.value) ? plainObj.value : Array.from(plainObj.value) - offset += plainObj.byteLength + const plainObj = readPlain(reader, columnMetadata.type, nValues, utf8) + values = Array.isArray(plainObj) ? 
plainObj : Array.from(plainObj)
   } else if (
     daph.encoding === 'PLAIN_DICTIONARY' ||
     daph.encoding === 'RLE_DICTIONARY' ||
@@ -77,14 +70,13 @@
     if (columnMetadata.type === 'BOOLEAN') {
       bitWidth = 1
     } else {
-      bitWidth = dataView.getUint8(offset)
-      offset += 1
+      bitWidth = view.getUint8(reader.offset)
+      reader.offset++
     }
     if (bitWidth) {
-      const { value, byteLength } = readRleBitPackedHybrid(
-        dataView, offset, bitWidth, dataView.byteLength - offset, nValues
+      const value = readRleBitPackedHybrid(
+        reader, bitWidth, view.byteLength - reader.offset, nValues
       )
-      offset += byteLength
       values = Array.isArray(value) ? value : Array.from(value)
     } else {
       // nval zeros
@@ -107,51 +99,51 @@
  * @returns {ArrayLike} array of values
  */
 export function readDictionaryPage(bytes, diph, schema, columnMetadata) {
-  const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
-  return readPlain(dataView, columnMetadata.type, diph.num_values, 0, false).value
+  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
+  const reader = { view, offset: 0 }
+  return readPlain(reader, columnMetadata.type, diph.num_values, false)
 }

 /**
  * Read the repetition levels from this page, if any.
  *
- * @param {DataView} dataView data view for the page
- * @param {number} offset offset to start reading from
+ * @typedef {import("./types.d.ts").DataReader} DataReader
+ * @param {DataReader} reader data view for the page
  * @param {DataPageHeader} daph data page header
  * @param {SchemaElement[]} schema schema for the file
  * @param {ColumnMetaData} columnMetadata metadata for the column
- * @returns {Decoded} repetition levels and number of bytes read
+ * @returns {any[]} repetition levels
  */
-function readRepetitionLevels(dataView, offset, daph, schema, columnMetadata) {
+function readRepetitionLevels(reader, daph, schema, columnMetadata) {
   if (columnMetadata.path_in_schema.length > 1) {
     const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema)
     if (maxRepetitionLevel) {
       const bitWidth = widthFromMaxInt(maxRepetitionLevel)
       return readData(
-        dataView, daph.repetition_level_encoding, offset, daph.num_values, bitWidth
+        reader, daph.repetition_level_encoding, daph.num_values, bitWidth
       )
     }
   }
-  return { value: [], byteLength: 0 }
+  return []
 }

 /**
  * Read the definition levels from this page, if any.
* - * @param {DataView} dataView data view for the page - * @param {number} offset offset to start reading from + * @param {DataReader} reader data view for the page * @param {DataPageHeader} daph data page header * @param {SchemaElement[]} schema schema for the file * @param {string[]} path_in_schema path in the schema * @returns {DefinitionLevels} definition levels and number of bytes read */ -function readDefinitionLevels(dataView, offset, daph, schema, path_in_schema) { +function readDefinitionLevels(reader, daph, schema, path_in_schema) { if (!isRequired(schema, path_in_schema)) { const maxDefinitionLevel = getMaxDefinitionLevel(schema, path_in_schema) const bitWidth = widthFromMaxInt(maxDefinitionLevel) if (bitWidth) { // num_values is index 1 for either type of page header - const { value: definitionLevels, byteLength } = readData( - dataView, daph.definition_level_encoding, offset, daph.num_values, bitWidth + const definitionLevels = readData( + reader, daph.definition_level_encoding, daph.num_values, bitWidth ) // count nulls @@ -163,8 +155,8 @@ function readDefinitionLevels(dataView, offset, daph, schema, path_in_schema) { definitionLevels.length = 0 } - return { byteLength, definitionLevels, numNulls } + return { definitionLevels, numNulls } } } - return { byteLength: 0, definitionLevels: [], numNulls: 0 } + return { definitionLevels: [], numNulls: 0 } } diff --git a/src/datapageV2.js b/src/datapageV2.js index d8c6181..a2e0a58 100644 --- a/src/datapageV2.js +++ b/src/datapageV2.js @@ -3,11 +3,6 @@ import { readPlain, readRleBitPackedHybrid, widthFromMaxInt } from './encoding.j import { getMaxDefinitionLevel, getMaxRepetitionLevel, schemaElement } from './schema.js' import { readVarInt, readZigZag } from './thrift.js' -/** - * @typedef {import("./types.d.ts").Decoded} Decoded - * @template T - */ - /** * Read a data page from the given Uint8Array. 
* @@ -25,8 +20,8 @@ import { readVarInt, readZigZag } from './thrift.js' * @returns {DataPage} definition levels, repetition levels, and array of values */ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, compressors) { - const dataView = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength) - let offset = 0 + const view = new DataView(compressedBytes.buffer, compressedBytes.byteOffset, compressedBytes.byteLength) + const reader = { view, offset: 0 } /** @type {any} */ let values = [] @@ -34,13 +29,19 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp if (!daph2) throw new Error('parquet data page header v2 is undefined') // repetition levels - const repetitionLevels = readRepetitionLevelsV2(dataView, offset, daph2, schema, columnMetadata) + const repetitionLevels = readRepetitionLevelsV2(reader, daph2, schema, columnMetadata) + + if (reader.offset !== daph2.repetition_levels_byte_length) { + throw new Error(`parquet repetition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length}`) + } // definition levels - offset += daph2.repetition_levels_byte_length const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) - const definitionLevels = readDefinitionLevelsV2(dataView, offset, daph2, maxDefinitionLevel) - offset += daph2.definition_levels_byte_length + const definitionLevels = readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel) + + if (reader.offset !== daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length) { + throw new Error(`parquet definition levels byte length ${reader.offset} does not match expected ${daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length}`) + } const uncompressedPageSize = ph.uncompressed_page_size - daph2.definition_levels_byte_length - daph2.repetition_levels_byte_length @@ -49,13 +50,13 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp if (daph2.encoding === 'PLAIN') { const { element } = schemaElement(schema, columnMetadata.path_in_schema) const utf8 = element.converted_type === 'UTF8' - let page = compressedBytes.slice(offset) + let page = compressedBytes.slice(reader.offset) if (daph2.is_compressed && columnMetadata.codec !== 'UNCOMPRESSED') { page = decompressPage(page, uncompressedPageSize, columnMetadata.codec, compressors) } const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) - const plainObj = readPlain(pageView, columnMetadata.type, nValues, 0, utf8) - values = plainObj.value + const pageReader = { view: pageView, offset: 0 } + values = readPlain(pageReader, columnMetadata.type, nValues, utf8) } else if (daph2.encoding === 'RLE') { const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors) const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength) @@ -63,21 +64,22 @@ export function readDataPageV2(compressedBytes, ph, schema, columnMetadata, comp if (daph2.num_nulls) { throw new Error('parquet RLE encoding with nulls not supported') } else { + const pageReader = { view: pageView, offset: 4 } values = readRleBitPackedHybrid( - pageView, 4, bitWidth, uncompressedPageSize, nValues - ).value + pageReader, bitWidth, uncompressedPageSize, nValues + ) } } else if ( daph2.encoding === 'PLAIN_DICTIONARY' || daph2.encoding === 'RLE_DICTIONARY' ) { - compressedBytes = compressedBytes.subarray(offset) + compressedBytes = 
compressedBytes.subarray(reader.offset)
     const page = decompressPage(compressedBytes, uncompressedPageSize, columnMetadata.codec, compressors)
     const pageView = new DataView(page.buffer, page.byteOffset, page.byteLength)
     const bitWidth = pageView.getUint8(0)
-    const { value } = readRleBitPackedHybrid(
-      pageView, 1, bitWidth, uncompressedPageSize, nValues
+    const pageReader = { view: pageView, offset: 1 }
+    const value = readRleBitPackedHybrid(
+      pageReader, bitWidth, uncompressedPageSize, nValues
     )
     values = value
   } else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
@@ -95,40 +97,39 @@
 /**
  * Read the repetition levels from this page, if any.
  *
- * @param {DataView} dataView data view for the page
- * @param {number} offset offset to start reading from
+ * @typedef {import("./types.d.ts").DataReader} DataReader
+ * @param {DataReader} reader data view for the page
  * @param {DataPageHeaderV2} daph2 data page header
  * @param {SchemaElement[]} schema schema for the file
  * @param {ColumnMetaData} columnMetadata metadata for the column
  * @returns {any[]} repetition levels and number of bytes read
  */
-export function readRepetitionLevelsV2(dataView, offset, daph2, schema, columnMetadata) {
+export function readRepetitionLevelsV2(reader, daph2, schema, columnMetadata) {
   const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema)
   if (!maxRepetitionLevel) return []

   const bitWidth = widthFromMaxInt(maxRepetitionLevel)
   // num_values is index 1 for either type of page header
   return readRleBitPackedHybrid(
-    dataView, offset, bitWidth, daph2.repetition_levels_byte_length, daph2.num_values
-  ).value
+    reader, bitWidth, daph2.repetition_levels_byte_length, daph2.num_values
+  )
 }

 /**
  * Read the definition levels from this page, if any.
  *
- * @param {DataView} dataView data view for the page
- * @param {number} offset offset to start reading from
+ * @param {DataReader} reader data view for the page
  * @param {DataPageHeaderV2} daph2 data page header v2
  * @param {number} maxDefinitionLevel maximum definition level for this column
  * @returns {number[] | undefined} definition levels and number of bytes read
  */
-function readDefinitionLevelsV2(dataView, offset, daph2, maxDefinitionLevel) {
+function readDefinitionLevelsV2(reader, daph2, maxDefinitionLevel) {
   if (maxDefinitionLevel) {
     // not the same as V1, because we know the length
     const bitWidth = widthFromMaxInt(maxDefinitionLevel)
     return readRleBitPackedHybrid(
-      dataView, offset, bitWidth, daph2.definition_levels_byte_length, daph2.num_values
-    ).value
+      reader, bitWidth, daph2.definition_levels_byte_length, daph2.num_values
+    )
   }
 }
diff --git a/src/encoding.js b/src/encoding.js
index 81243ff..909f2b6 100644
--- a/src/encoding.js
+++ b/src/encoding.js
@@ -1,189 +1,176 @@
 import { readVarInt } from './thrift.js'
 import { concat } from './utils.js'

-/**
- * Return type with bytes read.
- * This is useful to advance an offset through a buffer.
- *
- * @typedef {import("./types.d.ts").Decoded} Decoded
- * @template T
- */
-
 /**
  * Read `count` boolean values.
* - * @param {DataView} dataView - buffer to read data from - * @param {number} offset - offset to start reading from the DataView + * @param {DataReader} reader - buffer to read data from * @param {number} count - number of values to read - * @returns {Decoded} array of boolean values + * @returns {boolean[]} array of boolean values */ -function readPlainBoolean(dataView, offset, count) { +function readPlainBoolean(reader, count) { const value = [] for (let i = 0; i < count; i++) { - const byteOffset = offset + Math.floor(i / 8) + const byteOffset = reader.offset + Math.floor(i / 8) const bitOffset = i % 8 - const byte = dataView.getUint8(byteOffset) + const byte = reader.view.getUint8(byteOffset) value.push((byte & (1 << bitOffset)) !== 0) } - return { value, byteLength: Math.ceil(count / 8) } + reader.offset += Math.ceil(count / 8) + return value } /** * Read `count` int32 values. * - * @param {DataView} dataView - buffer to read data from - * @param {number} offset - offset to start reading from the DataView + * @param {DataReader} reader - buffer to read data from * @param {number} count - number of values to read - * @returns {Decoded} array of int32 values + * @returns {number[]} array of int32 values */ -function readPlainInt32(dataView, offset, count) { +function readPlainInt32(reader, count) { const value = [] for (let i = 0; i < count; i++) { - value.push(dataView.getInt32(offset + i * 4, true)) + value.push(reader.view.getInt32(reader.offset + i * 4, true)) } - return { value, byteLength: count * 4 } + reader.offset += count * 4 + return value } /** * Read `count` int64 values. * - * @param {DataView} dataView - buffer to read data from - * @param {number} offset - offset to start reading from the DataView + * @param {DataReader} reader - buffer to read data from * @param {number} count - number of values to read - * @returns {Decoded} array of int64 values + * @returns {bigint[]} array of int64 values */ -function readPlainInt64(dataView, offset, count) { +function readPlainInt64(reader, count) { const value = [] for (let i = 0; i < count; i++) { - value.push(dataView.getBigInt64(offset + i * 8, true)) + value.push(reader.view.getBigInt64(reader.offset + i * 8, true)) } - return { value, byteLength: count * 8 } + reader.offset += count * 8 + return value } /** * Read `count` int96 values. * - * @param {DataView} dataView - buffer to read data from - * @param {number} offset - offset to start reading from the DataView + * @param {DataReader} reader - buffer to read data from * @param {number} count - number of values to read - * @returns {Decoded} array of int96 values + * @returns {bigint[]} array of int96 values */ -function readPlainInt96(dataView, offset, count) { +function readPlainInt96(reader, count) { const value = [] for (let i = 0; i < count; i++) { - const low = dataView.getBigInt64(offset + i * 12, true) - const high = dataView.getInt32(offset + i * 12 + 8, true) + const low = reader.view.getBigInt64(reader.offset + i * 12, true) + const high = reader.view.getInt32(reader.offset + i * 12 + 8, true) value.push((BigInt(high) << BigInt(32)) | low) } - return { value, byteLength: count * 12 } + reader.offset += count * 12 + return value } /** * Read `count` float values. 
 *
- * @param {DataView} dataView - buffer to read data from
- * @param {number} offset - offset to start reading from the DataView
+ * @param {DataReader} reader - buffer to read data from
  * @param {number} count - number of values to read
- * @returns {Decoded} array of float values
+ * @returns {number[]} array of float values
  */
-function readPlainFloat(dataView, offset, count) {
+function readPlainFloat(reader, count) {
   const value = []
   for (let i = 0; i < count; i++) {
-    value.push(dataView.getFloat32(offset + i * 4, true))
+    value.push(reader.view.getFloat32(reader.offset + i * 4, true))
   }
-  return { value, byteLength: count * 4 }
+  reader.offset += count * 4
+  return value
 }

 /**
  * Read `count` double values.
  *
- * @param {DataView} dataView - buffer to read data from
- * @param {number} offset - offset to start reading from the DataView
+ * @param {DataReader} reader - buffer to read data from
  * @param {number} count - number of values to read
- * @returns {Decoded} array of double values
+ * @returns {number[]} array of double values
  */
-function readPlainDouble(dataView, offset, count) {
+function readPlainDouble(reader, count) {
   const value = []
   for (let i = 0; i < count; i++) {
-    value.push(dataView.getFloat64(offset + i * 8, true))
+    value.push(reader.view.getFloat64(reader.offset + i * 8, true))
   }
-  return { value, byteLength: count * 8 }
+  reader.offset += count * 8
+  return value
 }

 /**
  * Read `count` byte array values.
  *
- * @param {DataView} dataView - buffer to read data from
- * @param {number} offset - offset to start reading from the DataView
+ * @param {DataReader} reader - buffer to read data from
  * @param {number} count - number of values to read
- * @returns {Decoded} array of byte arrays
+ * @returns {Uint8Array[]} array of byte arrays
  */
-function readPlainByteArray(dataView, offset, count) {
+function readPlainByteArray(reader, count) {
   const value = []
-  let byteLength = 0 // byte length of all data read
   for (let i = 0; i < count; i++) {
-    const length = dataView.getInt32(offset + byteLength, true)
-    byteLength += 4
-    const bytes = new Uint8Array(dataView.buffer, dataView.byteOffset + offset + byteLength, length)
+    const length = reader.view.getInt32(reader.offset, true)
+    reader.offset += 4
+    const bytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length)
     value.push(bytes)
-    byteLength += length
+    reader.offset += length
   }
-  return { value, byteLength }
+  return value
 }

 /**
  * Read a fixed length byte array.
  *
- * @param {DataView} dataView - buffer to read data from
- * @param {number} offset - offset to start reading from the DataView
+ * @param {DataReader} reader - buffer to read data from
  * @param {number} fixedLength - length of each fixed length byte array
- * @returns {Decoded} array of fixed length byte arrays
+ * @returns {Uint8Array} fixed length byte array
  */
-function readPlainByteArrayFixed(dataView, offset, fixedLength) {
-  return {
-    value: new Uint8Array(dataView.buffer, dataView.byteOffset + offset, fixedLength),
-    byteLength: fixedLength,
-  }
+function readPlainByteArrayFixed(reader, fixedLength) {
+  reader.offset += fixedLength
+  return new Uint8Array(
+    reader.view.buffer,
+    reader.view.byteOffset + reader.offset - fixedLength,
+    fixedLength
+  )
 }

 /**
- * Read `count` values of the given type from the dataView.
+ * Read `count` values of the given type from the reader.
* * @typedef {import("./types.d.ts").DecodedArray} DecodedArray * @typedef {import("./types.d.ts").ParquetType} ParquetType - * @param {DataView} dataView - buffer to read data from + * @param {DataReader} reader - buffer to read data from * @param {ParquetType} type - parquet type of the data * @param {number} count - number of values to read - * @param {number} offset - offset to start reading from the DataView * @param {boolean} utf8 - whether to decode byte arrays as UTF-8 - * @returns {Decoded} array of values + * @returns {DecodedArray} array of values */ -export function readPlain(dataView, type, count, offset, utf8) { - if (count === 0) return { value: [], byteLength: 0 } +export function readPlain(reader, type, count, utf8) { + if (count === 0) return [] if (type === 'BOOLEAN') { - return readPlainBoolean(dataView, offset, count) + return readPlainBoolean(reader, count) } else if (type === 'INT32') { - return readPlainInt32(dataView, offset, count) + return readPlainInt32(reader, count) } else if (type === 'INT64') { - return readPlainInt64(dataView, offset, count) + return readPlainInt64(reader, count) } else if (type === 'INT96') { - return readPlainInt96(dataView, offset, count) + return readPlainInt96(reader, count) } else if (type === 'FLOAT') { - return readPlainFloat(dataView, offset, count) + return readPlainFloat(reader, count) } else if (type === 'DOUBLE') { - return readPlainDouble(dataView, offset, count) + return readPlainDouble(reader, count) } else if (type === 'BYTE_ARRAY') { - const byteArray = readPlainByteArray(dataView, offset, count) + const byteArray = readPlainByteArray(reader, count) if (utf8) { const decoder = new TextDecoder() - return { - value: byteArray.value.map(bytes => decoder.decode(bytes)), - byteLength: byteArray.byteLength, - } + return byteArray.map(bytes => decoder.decode(bytes)) } return byteArray } else if (type === 'FIXED_LEN_BYTE_ARRAY') { - return readPlainByteArrayFixed(dataView, offset, count) + return readPlainByteArrayFixed(reader, count) } else { throw new Error(`parquet unhandled type: ${type}`) } @@ -204,30 +191,27 @@ export function widthFromMaxInt(value) { * The data could be definition levels, repetition levels, or actual values. 
* * @typedef {import("./types.d.ts").Encoding} Encoding - * @param {DataView} dataView - buffer to read data from + * @param {DataReader} reader - buffer to read data from * @param {Encoding} encoding - encoding type - * @param {number} offset - offset to start reading from the DataView * @param {number} count - number of values to read * @param {number} bitWidth - width of each bit-packed group - * @returns {Decoded} array of values + * @returns {any[]} array of values */ -export function readData(dataView, encoding, offset, count, bitWidth) { +export function readData(reader, encoding, count, bitWidth) { /** @type {any[]} */ const value = [] - let byteLength = 0 if (encoding === 'RLE') { let seen = 0 while (seen < count) { - const rle = readRleBitPackedHybrid(dataView, offset + byteLength, bitWidth, 0, count) - if (!rle.value.length) break // EOF - concat(value, rle.value) - seen += rle.value.length - byteLength += rle.byteLength + const rle = readRleBitPackedHybrid(reader, bitWidth, 0, count) + if (!rle.length) break // EOF + concat(value, rle) + seen += rle.length } } else { throw new Error(`parquet encoding not supported ${encoding}`) } - return { value, byteLength } + return value } /** @@ -235,42 +219,39 @@ export function readData(dataView, encoding, offset, count, bitWidth) { * * If length is zero, then read as int32 at the start of the encoded data. * - * @param {DataView} dataView - buffer to read data from - * @param {number} offset - offset to start reading from the DataView + * @typedef {import("./types.d.ts").DataReader} DataReader + * @param {DataReader} reader - buffer to read data from * @param {number} width - width of each bit-packed group * @param {number} length - length of the encoded data * @param {number} numValues - number of values to read - * @returns {Decoded} array of rle/bit-packed values + * @returns {number[]} array of rle/bit-packed values */ -export function readRleBitPackedHybrid(dataView, offset, width, length, numValues) { - let byteLength = 0 +export function readRleBitPackedHybrid(reader, width, length, numValues) { if (!length) { - length = dataView.getInt32(offset, true) + length = reader.view.getInt32(reader.offset, true) + reader.offset += 4 if (length < 0) throw new Error(`parquet invalid rle/bitpack length ${length}`) - byteLength += 4 } /** @type {number[]} */ const value = [] - const startByteLength = byteLength - while (byteLength - startByteLength < length && value.length < numValues) { - const [header, newOffset] = readVarInt(dataView, offset + byteLength) - byteLength = newOffset - offset + const startOffset = reader.offset + while (reader.offset - startOffset < length && value.length < numValues) { + const [header, newOffset] = readVarInt(reader.view, reader.offset) + reader.offset = newOffset if ((header & 1) === 0) { // rle - const rle = readRle(dataView, offset + byteLength, header, width) - concat(value, rle.value) - byteLength += rle.byteLength + const rle = readRle(reader, header, width) + concat(value, rle) } else { // bit-packed const bitPacked = readBitPacked( - dataView, offset + byteLength, header, width, numValues - value.length + reader, header, width, numValues - value.length ) - concat(value, bitPacked.value) - byteLength += bitPacked.byteLength + concat(value, bitPacked) } } - return { value, byteLength } + return value } /** @@ -279,26 +260,24 @@ export function readRleBitPackedHybrid(dataView, offset, width, length, numValue * The count is determined from the header and the width is used to grab the * value that's 
repeated. Yields the value repeated count times. * - * @param {DataView} dataView - buffer to read data from - * @param {number} offset - offset to start reading from the DataView + * @param {DataReader} reader - buffer to read data from * @param {number} header - header information * @param {number} bitWidth - width of each bit-packed group - * @returns {Decoded} array of rle values + * @returns {number[]} array of rle values */ -function readRle(dataView, offset, header, bitWidth) { +function readRle(reader, header, bitWidth) { const count = header >>> 1 const width = (bitWidth + 7) >> 3 - let byteLength = 0 let readValue if (width === 1) { - readValue = dataView.getUint8(offset) - byteLength += 1 + readValue = reader.view.getUint8(reader.offset) + reader.offset++ } else if (width === 2) { - readValue = dataView.getUint16(offset, true) - byteLength += 2 + readValue = reader.view.getUint16(reader.offset, true) + reader.offset += 2 } else if (width === 4) { - readValue = dataView.getUint32(offset, true) - byteLength += 4 + readValue = reader.view.getUint32(reader.offset, true) + reader.offset += 4 } else { throw new Error(`parquet invalid rle width ${width}`) } @@ -308,33 +287,32 @@ function readRle(dataView, offset, header, bitWidth) { for (let i = 0; i < count; i++) { value.push(readValue) } - return { value, byteLength } + return value } /** * Read a bit-packed run of the rle/bitpack hybrid. * Supports width > 8 (crossing bytes). * - * @param {DataView} dataView - buffer to read data from - * @param {number} offset - offset to start reading from the DataView + * @param {DataReader} reader - buffer to read data from * @param {number} header - header information * @param {number} bitWidth - width of each bit-packed group * @param {number} remaining - number of values remaining to be read - * @returns {Decoded} array of bit-packed values + * @returns {number[]} array of bit-packed values */ -function readBitPacked(dataView, offset, header, bitWidth, remaining) { +function readBitPacked(reader, header, bitWidth, remaining) { // extract number of values to read from header let count = (header >> 1) << 3 const mask = maskForBits(bitWidth) // Sometimes it tries to read outside of available memory, but it will be masked out anyway let data = 0 - if (offset < dataView.byteLength) { - data = dataView.getUint8(offset) + if (reader.offset < reader.view.byteLength) { + data = reader.view.getUint8(reader.offset) + reader.offset++ } else if (mask) { - throw new Error(`parquet bitpack offset ${offset} out of range`) + throw new Error(`parquet bitpack offset ${reader.offset} out of range`) } - let byteLength = 1 let left = 8 let right = 0 /** @type {number[]} */ @@ -349,8 +327,8 @@ function readBitPacked(dataView, offset, header, bitWidth, remaining) { data >>= 8 } else if (left - right < bitWidth) { // if we don't have bitWidth number of bits to read, read next byte - data |= dataView.getUint8(offset + byteLength) << left - byteLength++ + data |= reader.view.getUint8(reader.offset) << left + reader.offset++ left += 8 } else { // otherwise, read bitWidth number of bits @@ -367,7 +345,7 @@ function readBitPacked(dataView, offset, header, bitWidth, remaining) { } // return values and number of bytes read - return { value, byteLength } + return value } /** diff --git a/src/schema.js b/src/schema.js index 39038a6..d0769e9 100644 --- a/src/schema.js +++ b/src/schema.js @@ -79,7 +79,7 @@ export function getMaxRepetitionLevel(schema, parts) { parts.forEach((part, i) => { const { element } = 
schemaElement(schema, parts.slice(0, i + 1)) if (element.repetition_type === 'REPEATED') { - maxLevel += 1 + maxLevel++ } }) return maxLevel @@ -97,7 +97,7 @@ export function getMaxDefinitionLevel(schema, parts) { parts.forEach((part, i) => { const { element } = schemaElement(schema, parts.slice(0, i + 1)) if (element.repetition_type !== 'REQUIRED') { - maxLevel += 1 + maxLevel++ } }) return maxLevel @@ -113,7 +113,7 @@ export function skipDefinitionBytes(num) { let byteLength = 6 let n = num >>> 8 while (n !== 0) { - byteLength += 1 + byteLength++ n >>>= 7 } return byteLength diff --git a/src/snappy.js b/src/snappy.js index d0dc4e7..ae82aa4 100644 --- a/src/snappy.js +++ b/src/snappy.js @@ -54,7 +54,7 @@ export function snappyUncompress(input, output) { // skip preamble (contains uncompressed length as varint) while (pos < inputLength) { const c = input[pos] - pos += 1 + pos++ if (c < 128) { break } @@ -66,7 +66,7 @@ export function snappyUncompress(input, output) { while (pos < inputLength) { const c = input[pos] let len = 0 - pos += 1 + pos++ if (pos >= inputLength) { throw new Error('missing eof marker') @@ -103,7 +103,7 @@ export function snappyUncompress(input, output) { // Copy with 1-byte offset len = ((c >>> 2) & 0x7) + 4 offset = input[pos] + ((c >>> 5) << 8) - pos += 1 + pos++ break case 2: // Copy with 2-byte offset diff --git a/src/types.d.ts b/src/types.d.ts index cb1c233..d1ae293 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -15,6 +15,11 @@ export interface Decoded { byteLength: number } +export interface DataReader { + view: DataView + offset: number +} + // Parquet file metadata types export interface FileMetaData { version: number diff --git a/test/encoding.test.js b/test/encoding.test.js index 5b52c37..a4ed3bf 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -4,89 +4,96 @@ import { readPlain, readRleBitPackedHybrid } from '../src/encoding.js' describe('readPlain', () => { it('reads BOOLEAN values correctly', () => { - const dataView = new DataView(new ArrayBuffer(1)) - dataView.setUint8(0, 0b00000001) // Set the first bit to 1 - const result = readPlain(dataView, 'BOOLEAN', 1, 0, false) - expect(result).toEqual({ value: [true], byteLength: 1 }) + const view = new DataView(new ArrayBuffer(1)) + view.setUint8(0, 0b00000001) // Set the first bit to 1 + const reader = { view, offset: 0 } + const result = readPlain(reader, 'BOOLEAN', 1, false) + expect(result).toEqual([true]) + expect(reader.offset).toBe(1) }) it('reads INT32 values correctly', () => { - const dataView = new DataView(new ArrayBuffer(4)) - dataView.setInt32(0, 123456789, true) // little-endian - const result = readPlain(dataView, 'INT32', 1, 0, false) - expect(result).toEqual({ value: [123456789], byteLength: 4 }) + const view = new DataView(new ArrayBuffer(4)) + view.setInt32(0, 123456789, true) // little-endian + const reader = { view, offset: 0 } + const result = readPlain(reader, 'INT32', 1, false) + expect(result).toEqual([123456789]) + expect(reader.offset).toBe(4) }) it('reads INT64 values correctly', () => { - const dataView = new DataView(new ArrayBuffer(8)) - dataView.setBigInt64(0, BigInt('1234567890123456789'), true) - const result = readPlain(dataView, 'INT64', 1, 0, false) - expect(result).toEqual({ value: [1234567890123456789n], byteLength: 8 }) + const view = new DataView(new ArrayBuffer(8)) + view.setBigInt64(0, BigInt('1234567890123456789'), true) + const reader = { view, offset: 0 } + const result = readPlain(reader, 'INT64', 1, false) + 
expect(result).toEqual([1234567890123456789n]) + expect(reader.offset).toBe(8) }) it('reads INT96 values correctly', () => { const buffer = new ArrayBuffer(12) - const dataView = new DataView(buffer) + const view = new DataView(buffer) // Example INT96 value split into 64-bit low part and 32-bit high part const low = BigInt('0x0123456789ABCDEF') const high = 0x02345678 - dataView.setBigInt64(0, low, true) - dataView.setInt32(8, high, true) + view.setBigInt64(0, low, true) + view.setInt32(8, high, true) + const reader = { view, offset: 0 } + const result = readPlain(reader, 'INT96', 1, false) const expectedValue = (BigInt(high) << BigInt(32)) | low - - const result = readPlain(dataView, 'INT96', 1, 0, false) - expect(result).toEqual({ - value: [expectedValue], - byteLength: 12, - }) + expect(result).toEqual([expectedValue]) + expect(reader.offset).toBe(12) }) it('reads FLOAT values correctly', () => { - const dataView = new DataView(new ArrayBuffer(4)) - dataView.setFloat32(0, 1234.5, true) // little-endian - const result = readPlain(dataView, 'FLOAT', 1, 0, false) - expect(result).toEqual({ value: [1234.5], byteLength: 4 }) + const view = new DataView(new ArrayBuffer(4)) + view.setFloat32(0, 1234.5, true) // little-endian + const reader = { view, offset: 0 } + const result = readPlain(reader, 'FLOAT', 1, false) + expect(result).toEqual([1234.5]) + expect(reader.offset).toBe(4) }) it('reads DOUBLE values correctly', () => { - const dataView = new DataView(new ArrayBuffer(8)) - dataView.setFloat64(0, 12345.6789, true) // little-endian - const result = readPlain(dataView, 'DOUBLE', 1, 0, false) - expect(result).toEqual({ value: [12345.6789], byteLength: 8 }) + const view = new DataView(new ArrayBuffer(8)) + view.setFloat64(0, 12345.6789, true) // little-endian + const reader = { view, offset: 0 } + const result = readPlain(reader, 'DOUBLE', 1, false) + expect(result).toEqual([12345.6789]) + expect(reader.offset).toBe(8) }) it('reads BYTE_ARRAY values correctly', () => { - const dataView = new DataView(new ArrayBuffer(10)) - dataView.setInt32(0, 3, true) // length of the first byte array - dataView.setUint8(4, 1) // first byte array data - dataView.setUint8(5, 2) - dataView.setUint8(6, 3) - const result = readPlain(dataView, 'BYTE_ARRAY', 1, 0, false) - expect(result).toEqual({ - value: [new Uint8Array([1, 2, 3])], - byteLength: 7, - }) + const view = new DataView(new ArrayBuffer(10)) + view.setInt32(0, 3, true) // length of the first byte array + view.setUint8(4, 1) // first byte array data + view.setUint8(5, 2) + view.setUint8(6, 3) + const reader = { view, offset: 0 } + const result = readPlain(reader, 'BYTE_ARRAY', 1, false) + expect(result).toEqual([new Uint8Array([1, 2, 3])]) + expect(reader.offset).toBe(7) }) it('reads FIXED_LEN_BYTE_ARRAY values correctly', () => { const fixedLength = 3 - const dataView = new DataView(new ArrayBuffer(fixedLength)) - dataView.setUint8(0, 4) - dataView.setUint8(1, 5) - dataView.setUint8(2, 6) - const result = readPlain(dataView, 'FIXED_LEN_BYTE_ARRAY', fixedLength, 0, false) - expect(result).toEqual({ - value: new Uint8Array([4, 5, 6]), - byteLength: fixedLength, - }) + const view = new DataView(new ArrayBuffer(fixedLength)) + view.setUint8(0, 4) + view.setUint8(1, 5) + view.setUint8(2, 6) + const reader = { view, offset: 0 } + const result = readPlain(reader, 'FIXED_LEN_BYTE_ARRAY', fixedLength, false) + expect(result).toEqual(new Uint8Array([4, 5, 6])) + expect(reader.offset).toBe(fixedLength) }) it('throws an error for unhandled types', () => { - 
const dataView = new DataView(new ArrayBuffer(0)) + const view = new DataView(new ArrayBuffer(0)) + const reader = { view, offset: 0 } /** @type any */ const invalidType = 'invalidType' - expect(() => readPlain(dataView, invalidType, 1, 0, false)) + expect(() => readPlain(reader, invalidType, 1, false)) .toThrow(`parquet unhandled type: ${invalidType}`) }) }) @@ -97,29 +104,31 @@ describe('readRleBitPackedHybrid', () => { // RLE values: true x3 // Bit-packed values: false, false, true const buffer = new ArrayBuffer(4) - const dataView = new DataView(buffer) - dataView.setUint8(0, 0b00000110) // RLE header for 3 true values - dataView.setUint8(1, 0b00000001) // RLE value (true) - dataView.setUint8(2, 0b00000011) // Bit-packed header for 3 values - dataView.setUint8(3, 0b00000100) // Bit-packed values (false, false, true) + const view = new DataView(buffer) + view.setUint8(0, 0b00000110) // RLE header for 3 true values + view.setUint8(1, 0b00000001) // RLE value (true) + view.setUint8(2, 0b00000011) // Bit-packed header for 3 values + view.setUint8(3, 0b00000100) // Bit-packed values (false, false, true) + const reader = { view, offset: 0 } - const { byteLength, value } = readRleBitPackedHybrid(dataView, 0, 1, 3, 6) - expect(byteLength).toBe(4) + const value = readRleBitPackedHybrid(reader, 1, 3, 6) + expect(reader.offset).toBe(4) expect(value).toEqual([1, 1, 1, 0, 0, 1]) }) it('reads RLE bit-packed hybrid values with implicit length', () => { // Example buffer: same as previous test, but with implicit length const buffer = new ArrayBuffer(8) - const dataView = new DataView(buffer) - dataView.setInt32(0, 3, true) // length 3 little-endian - dataView.setUint8(4, 0b00000110) // RLE header for 3 true values - dataView.setUint8(5, 0b00000001) // RLE value (true) - dataView.setUint8(6, 0b00000011) // Bit-packed header for 3 values - dataView.setUint8(7, 0b00000100) // Bit-packed values (false, false, true) + const view = new DataView(buffer) + view.setInt32(0, 3, true) // length 3 little-endian + view.setUint8(4, 0b00000110) // RLE header for 3 true values + view.setUint8(5, 0b00000001) // RLE value (true) + view.setUint8(6, 0b00000011) // Bit-packed header for 3 values + view.setUint8(7, 0b00000100) // Bit-packed values (false, false, true) + const reader = { view, offset: 0 } - const { byteLength, value } = readRleBitPackedHybrid(dataView, 0, 1, 0, 6) - expect(byteLength).toBe(8) + const value = readRleBitPackedHybrid(reader, 1, 0, 6) + expect(reader.offset).toBe(8) expect(value).toEqual([1, 1, 1, 0, 0, 1]) }) })
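
Usage sketch (illustrative only, not part of the patch): the DataReader added in
types.d.ts is a plain { view, offset } object that each decoder mutates in place,
so successive calls resume where the previous one stopped instead of each call
returning a { value, byteLength } pair. A minimal example against the new
readPlain signature, assuming the same file layout used by the tests above:

  import { readPlain } from '../src/encoding.js'

  const view = new DataView(new ArrayBuffer(12))
  view.setInt32(0, 42, true)      // one INT32 value
  view.setBigInt64(4, 7n, true)   // one INT64 value

  // a DataReader is just { view, offset }; each read advances reader.offset
  const reader = { view, offset: 0 }
  const int32s = readPlain(reader, 'INT32', 1, false) // [42], reader.offset is now 4
  const int64s = readPlain(reader, 'INT64', 1, false) // [7n], reader.offset is now 12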