From ef8e1c8c718a84e968bdfd009ccb998eaa510b0d Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Tue, 17 Jun 2025 14:16:38 -0700 Subject: [PATCH] Fix bug when encoding length is zero (#93) --- src/datapage.js | 18 +++++------- src/encoding.js | 8 ++--- test/encoding.test.js | 16 +++++----- test/files/mostlyempty.json | 12 ++++++++ test/files/mostlyempty.metadata.json | 42 +++++++++++++++++++++++++++ test/files/mostlyempty.parquet | Bin 0 -> 134 bytes 6 files changed, 74 insertions(+), 22 deletions(-) create mode 100644 test/files/mostlyempty.json create mode 100644 test/files/mostlyempty.metadata.json create mode 100644 test/files/mostlyempty.parquet diff --git a/src/datapage.js b/src/datapage.js index 9d3a825..9749fdf 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -37,11 +37,11 @@ export function readDataPage(bytes, daph, { type, element, schemaPath }) { if (bitWidth) { dataPage = new Array(nValues) if (type === 'BOOLEAN') { - readRleBitPackedHybrid(reader, bitWidth, 0, dataPage) + readRleBitPackedHybrid(reader, bitWidth, dataPage) dataPage = dataPage.map(x => !!x) // convert to boolean } else { // assert(daph.encoding.endsWith('_DICTIONARY')) - readRleBitPackedHybrid(reader, bitWidth, view.byteLength - reader.offset, dataPage) + readRleBitPackedHybrid(reader, bitWidth, dataPage, view.byteLength - reader.offset) } } else { dataPage = new Uint8Array(nValues) // nValue zeroes @@ -74,7 +74,7 @@ function readRepetitionLevels(reader, daph, schemaPath) { const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath) if (maxRepetitionLevel) { const values = new Array(daph.num_values) - readRleBitPackedHybrid(reader, bitWidth(maxRepetitionLevel), 0, values) + readRleBitPackedHybrid(reader, bitWidth(maxRepetitionLevel), values) return values } } @@ -92,7 +92,7 @@ function readDefinitionLevels(reader, daph, schemaPath) { if (!maxDefinitionLevel) return { definitionLevels: [], numNulls: 0 } const definitionLevels = new Array(daph.num_values) - readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), 0, definitionLevels) + readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), definitionLevels) // count nulls let numNulls = daph.num_values @@ -173,7 +173,7 @@ export function readDataPageV2(compressedBytes, ph, columnDecoder) { } else if (daph2.encoding === 'RLE') { // assert(type === 'BOOLEAN') dataPage = new Array(nValues) - readRleBitPackedHybrid(pageReader, 1, 0, dataPage) + readRleBitPackedHybrid(pageReader, 1, dataPage) dataPage = dataPage.map(x => !!x) } else if ( daph2.encoding === 'PLAIN_DICTIONARY' || @@ -181,7 +181,7 @@ export function readDataPageV2(compressedBytes, ph, columnDecoder) { ) { const bitWidth = pageView.getUint8(pageReader.offset++) dataPage = new Array(nValues) - readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize - 1, dataPage) + readRleBitPackedHybrid(pageReader, bitWidth, dataPage, uncompressedPageSize - 1) } else if (daph2.encoding === 'DELTA_BINARY_PACKED') { const int32 = type === 'INT32' dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues) @@ -212,9 +212,7 @@ function readRepetitionLevelsV2(reader, daph2, schemaPath) { if (!maxRepetitionLevel) return [] const values = new Array(daph2.num_values) - readRleBitPackedHybrid( - reader, bitWidth(maxRepetitionLevel), daph2.repetition_levels_byte_length, values - ) + readRleBitPackedHybrid(reader, bitWidth(maxRepetitionLevel), values, daph2.repetition_levels_byte_length) return values } @@ -229,7 +227,7 @@ function readDefinitionLevelsV2(reader, daph2, schemaPath) { if (maxDefinitionLevel) { // V2 we know the length const values = new Array(daph2.num_values) - readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), daph2.definition_levels_byte_length, values) + readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), values, daph2.definition_levels_byte_length) return values } } diff --git a/src/encoding.js b/src/encoding.js index de3fe41..8add018 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -16,12 +16,12 @@ export function bitWidth(value) { * If length is zero, then read int32 length at the start. * * @param {DataReader} reader - * @param {number} width - width of each bit-packed group - * @param {number} length - length of the encoded data + * @param {number} width - bitwidth * @param {DecodedArray} output + * @param {number} [length] - length of the encoded data */ -export function readRleBitPackedHybrid(reader, width, length, output) { - if (!length) { +export function readRleBitPackedHybrid(reader, width, output, length) { + if (length === undefined) { length = reader.view.getUint32(reader.offset, true) reader.offset += 4 } diff --git a/test/encoding.test.js b/test/encoding.test.js index 62f974f..6f9079a 100644 --- a/test/encoding.test.js +++ b/test/encoding.test.js @@ -14,7 +14,7 @@ describe('readRle', () => { const reader = { view, offset: 0 } const values = new Array(6) - readRleBitPackedHybrid(reader, 1, 4, values) + readRleBitPackedHybrid(reader, 1, values, 4) expect(reader.offset).toBe(4) expect(values).toEqual([1, 1, 1, 100, 100, 100]) }) @@ -28,7 +28,7 @@ describe('readRle', () => { const reader = { view, offset: 0 } const values = new Array(3) - readRleBitPackedHybrid(reader, 16, 6, values) + readRleBitPackedHybrid(reader, 16, values, 6) expect(reader.offset).toBe(6) expect(values).toEqual([65535, 65535, 65535]) }) @@ -44,7 +44,7 @@ describe('readRle', () => { const reader = { view, offset: 0 } const values = new Array(2) - readRleBitPackedHybrid(reader, 24, 4, values) + readRleBitPackedHybrid(reader, 24, values, 4) expect(reader.offset).toBe(4) expect(values).toEqual([16777215, 16777215]) }) @@ -58,7 +58,7 @@ describe('readRle', () => { const reader = { view, offset: 0 } const values = new Array(3) - readRleBitPackedHybrid(reader, 32, 5, values) + readRleBitPackedHybrid(reader, 32, values, 5) expect(reader.offset).toBe(5) expect(values).toEqual([234000, 234000, 234000]) }) @@ -75,7 +75,7 @@ describe('readBitPacked', () => { const reader = { view, offset: 0 } const values = new Array(3) - readRleBitPackedHybrid(reader, 1, 0, values) + readRleBitPackedHybrid(reader, 1, values) expect(reader.offset).toBe(6) expect(values).toEqual([0, 0, 1]) }) @@ -90,7 +90,7 @@ describe('readBitPacked', () => { const reader = { view, offset: 0 } const values = new Array(9) - readRleBitPackedHybrid(reader, 1, 3, values) + readRleBitPackedHybrid(reader, 1, values, 3) expect(reader.offset).toBe(3) expect(values).toEqual([1, 1, 1, 1, 1, 1, 1, 1, 1]) }) @@ -110,7 +110,7 @@ describe('readBitPacked', () => { const reader = { view, offset: 0 } const values = new Array(72) - readRleBitPackedHybrid(reader, 17, 154, values) + readRleBitPackedHybrid(reader, 17, values, 154) expect(reader.offset).toBe(154) expect(values).toEqual([ 131071, 0, 0, 0, 0, 0, 0, 0, @@ -132,7 +132,7 @@ describe('readBitPacked', () => { const reader = { view, offset: 0 } const values = new Array(3) - expect(() => readRleBitPackedHybrid(reader, 1, 3, values)) + expect(() => readRleBitPackedHybrid(reader, 1, values, 3)) .toThrow('parquet bitpack offset 1 out of range') }) }) diff --git a/test/files/mostlyempty.json b/test/files/mostlyempty.json new file mode 100644 index 0000000..a544c4c --- /dev/null +++ b/test/files/mostlyempty.json @@ -0,0 +1,12 @@ +[ + [null], + [null], + [null], + [null], + [null], + [null], + [null], + [null], + [null], + [null] +] diff --git a/test/files/mostlyempty.metadata.json b/test/files/mostlyempty.metadata.json new file mode 100644 index 0000000..8d8e74e --- /dev/null +++ b/test/files/mostlyempty.metadata.json @@ -0,0 +1,42 @@ +{ + "version": 2, + "schema": [ + { + "name": "root", + "num_children": 1 + }, + { + "type": "BYTE_ARRAY", + "repetition_type": "OPTIONAL", + "name": "empty" + } + ], + "num_rows": 10, + "row_groups": [ + { + "columns": [ + { + "file_offset": 4, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": ["RLE_DICTIONARY"], + "path_in_schema": ["empty"], + "codec": "SNAPPY", + "num_values": 10, + "total_uncompressed_size": 40, + "total_compressed_size": 40, + "data_page_offset": 18, + "dictionary_page_offset": 4, + "statistics": { + "null_count": 10 + } + } + } + ], + "total_byte_size": 40, + "num_rows": 10 + } + ], + "created_by": "hyparquet", + "metadata_length": 82 +} diff --git a/test/files/mostlyempty.parquet b/test/files/mostlyempty.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3e9b54cfbc5bc05f67694837634d5bb1a3d2739f GIT binary patch literal 134 zcmWG=3^EjD5oHi%@&OVIKp@Hngj_MAB48*0l4M{IVPFKxut@57uoUIzmxwYki1Mg1 zNwB8o7L-&nh>1wbNXn>j$cXYtiV8?Vg+-afM8pEb0@PH1GG-z`Q-C}X1`W=P%7VnA P!qU_dh9IC|0MG>hV}2AA literal 0 HcmV?d00001