Fix bug when encoding length is zero (#93)

This commit is contained in:
Kenny Daniel 2025-06-17 14:16:38 -07:00 committed by GitHub
parent 2eb793c30f
commit ef8e1c8c71
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 74 additions and 22 deletions

@ -37,11 +37,11 @@ export function readDataPage(bytes, daph, { type, element, schemaPath }) {
if (bitWidth) {
dataPage = new Array(nValues)
if (type === 'BOOLEAN') {
readRleBitPackedHybrid(reader, bitWidth, 0, dataPage)
readRleBitPackedHybrid(reader, bitWidth, dataPage)
dataPage = dataPage.map(x => !!x) // convert to boolean
} else {
// assert(daph.encoding.endsWith('_DICTIONARY'))
readRleBitPackedHybrid(reader, bitWidth, view.byteLength - reader.offset, dataPage)
readRleBitPackedHybrid(reader, bitWidth, dataPage, view.byteLength - reader.offset)
}
} else {
dataPage = new Uint8Array(nValues) // nValue zeroes
@ -74,7 +74,7 @@ function readRepetitionLevels(reader, daph, schemaPath) {
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
if (maxRepetitionLevel) {
const values = new Array(daph.num_values)
readRleBitPackedHybrid(reader, bitWidth(maxRepetitionLevel), 0, values)
readRleBitPackedHybrid(reader, bitWidth(maxRepetitionLevel), values)
return values
}
}
@ -92,7 +92,7 @@ function readDefinitionLevels(reader, daph, schemaPath) {
if (!maxDefinitionLevel) return { definitionLevels: [], numNulls: 0 }
const definitionLevels = new Array(daph.num_values)
readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), 0, definitionLevels)
readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), definitionLevels)
// count nulls
let numNulls = daph.num_values
@ -173,7 +173,7 @@ export function readDataPageV2(compressedBytes, ph, columnDecoder) {
} else if (daph2.encoding === 'RLE') {
// assert(type === 'BOOLEAN')
dataPage = new Array(nValues)
readRleBitPackedHybrid(pageReader, 1, 0, dataPage)
readRleBitPackedHybrid(pageReader, 1, dataPage)
dataPage = dataPage.map(x => !!x)
} else if (
daph2.encoding === 'PLAIN_DICTIONARY' ||
@ -181,7 +181,7 @@ export function readDataPageV2(compressedBytes, ph, columnDecoder) {
) {
const bitWidth = pageView.getUint8(pageReader.offset++)
dataPage = new Array(nValues)
readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize - 1, dataPage)
readRleBitPackedHybrid(pageReader, bitWidth, dataPage, uncompressedPageSize - 1)
} else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
const int32 = type === 'INT32'
dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues)
@ -212,9 +212,7 @@ function readRepetitionLevelsV2(reader, daph2, schemaPath) {
if (!maxRepetitionLevel) return []
const values = new Array(daph2.num_values)
readRleBitPackedHybrid(
reader, bitWidth(maxRepetitionLevel), daph2.repetition_levels_byte_length, values
)
readRleBitPackedHybrid(reader, bitWidth(maxRepetitionLevel), values, daph2.repetition_levels_byte_length)
return values
}
@ -229,7 +227,7 @@ function readDefinitionLevelsV2(reader, daph2, schemaPath) {
if (maxDefinitionLevel) {
// V2 we know the length
const values = new Array(daph2.num_values)
readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), daph2.definition_levels_byte_length, values)
readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), values, daph2.definition_levels_byte_length)
return values
}
}

@ -16,12 +16,12 @@ export function bitWidth(value) {
* If length is omitted (undefined), then read a 4-byte little-endian length prefix at the start.
*
* @param {DataReader} reader
* @param {number} width - width of each bit-packed group
* @param {number} length - length of the encoded data
* @param {number} width - bitwidth
* @param {DecodedArray} output
* @param {number} [length] - length of the encoded data
*/
export function readRleBitPackedHybrid(reader, width, length, output) {
if (!length) {
export function readRleBitPackedHybrid(reader, width, output, length) {
if (length === undefined) {
length = reader.view.getUint32(reader.offset, true)
reader.offset += 4
}

@ -14,7 +14,7 @@ describe('readRle', () => {
const reader = { view, offset: 0 }
const values = new Array(6)
readRleBitPackedHybrid(reader, 1, 4, values)
readRleBitPackedHybrid(reader, 1, values, 4)
expect(reader.offset).toBe(4)
expect(values).toEqual([1, 1, 1, 100, 100, 100])
})
@ -28,7 +28,7 @@ describe('readRle', () => {
const reader = { view, offset: 0 }
const values = new Array(3)
readRleBitPackedHybrid(reader, 16, 6, values)
readRleBitPackedHybrid(reader, 16, values, 6)
expect(reader.offset).toBe(6)
expect(values).toEqual([65535, 65535, 65535])
})
@ -44,7 +44,7 @@ describe('readRle', () => {
const reader = { view, offset: 0 }
const values = new Array(2)
readRleBitPackedHybrid(reader, 24, 4, values)
readRleBitPackedHybrid(reader, 24, values, 4)
expect(reader.offset).toBe(4)
expect(values).toEqual([16777215, 16777215])
})
@ -58,7 +58,7 @@ describe('readRle', () => {
const reader = { view, offset: 0 }
const values = new Array(3)
readRleBitPackedHybrid(reader, 32, 5, values)
readRleBitPackedHybrid(reader, 32, values, 5)
expect(reader.offset).toBe(5)
expect(values).toEqual([234000, 234000, 234000])
})
@ -75,7 +75,7 @@ describe('readBitPacked', () => {
const reader = { view, offset: 0 }
const values = new Array(3)
readRleBitPackedHybrid(reader, 1, 0, values)
readRleBitPackedHybrid(reader, 1, values)
expect(reader.offset).toBe(6)
expect(values).toEqual([0, 0, 1])
})
@ -90,7 +90,7 @@ describe('readBitPacked', () => {
const reader = { view, offset: 0 }
const values = new Array(9)
readRleBitPackedHybrid(reader, 1, 3, values)
readRleBitPackedHybrid(reader, 1, values, 3)
expect(reader.offset).toBe(3)
expect(values).toEqual([1, 1, 1, 1, 1, 1, 1, 1, 1])
})
@ -110,7 +110,7 @@ describe('readBitPacked', () => {
const reader = { view, offset: 0 }
const values = new Array(72)
readRleBitPackedHybrid(reader, 17, 154, values)
readRleBitPackedHybrid(reader, 17, values, 154)
expect(reader.offset).toBe(154)
expect(values).toEqual([
131071, 0, 0, 0, 0, 0, 0, 0,
@ -132,7 +132,7 @@ describe('readBitPacked', () => {
const reader = { view, offset: 0 }
const values = new Array(3)
expect(() => readRleBitPackedHybrid(reader, 1, 3, values))
expect(() => readRleBitPackedHybrid(reader, 1, values, 3))
.toThrow('parquet bitpack offset 1 out of range')
})
})

@ -0,0 +1,12 @@
[
[null],
[null],
[null],
[null],
[null],
[null],
[null],
[null],
[null],
[null]
]

@ -0,0 +1,42 @@
{
"version": 2,
"schema": [
{
"name": "root",
"num_children": 1
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "empty"
}
],
"num_rows": 10,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["RLE_DICTIONARY"],
"path_in_schema": ["empty"],
"codec": "SNAPPY",
"num_values": 10,
"total_uncompressed_size": 40,
"total_compressed_size": 40,
"data_page_offset": 18,
"dictionary_page_offset": 4,
"statistics": {
"null_count": 10
}
}
}
],
"total_byte_size": 40,
"num_rows": 10
}
],
"created_by": "hyparquet",
"metadata_length": 82
}

Binary file not shown.