mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-05 22:41:55 +00:00
Fix bug when encoding length is zero (#93)
This commit is contained in:
parent
2eb793c30f
commit
ef8e1c8c71
@ -37,11 +37,11 @@ export function readDataPage(bytes, daph, { type, element, schemaPath }) {
|
||||
if (bitWidth) {
|
||||
dataPage = new Array(nValues)
|
||||
if (type === 'BOOLEAN') {
|
||||
readRleBitPackedHybrid(reader, bitWidth, 0, dataPage)
|
||||
readRleBitPackedHybrid(reader, bitWidth, dataPage)
|
||||
dataPage = dataPage.map(x => !!x) // convert to boolean
|
||||
} else {
|
||||
// assert(daph.encoding.endsWith('_DICTIONARY'))
|
||||
readRleBitPackedHybrid(reader, bitWidth, view.byteLength - reader.offset, dataPage)
|
||||
readRleBitPackedHybrid(reader, bitWidth, dataPage, view.byteLength - reader.offset)
|
||||
}
|
||||
} else {
|
||||
dataPage = new Uint8Array(nValues) // nValue zeroes
|
||||
@ -74,7 +74,7 @@ function readRepetitionLevels(reader, daph, schemaPath) {
|
||||
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
|
||||
if (maxRepetitionLevel) {
|
||||
const values = new Array(daph.num_values)
|
||||
readRleBitPackedHybrid(reader, bitWidth(maxRepetitionLevel), 0, values)
|
||||
readRleBitPackedHybrid(reader, bitWidth(maxRepetitionLevel), values)
|
||||
return values
|
||||
}
|
||||
}
|
||||
@ -92,7 +92,7 @@ function readDefinitionLevels(reader, daph, schemaPath) {
|
||||
if (!maxDefinitionLevel) return { definitionLevels: [], numNulls: 0 }
|
||||
|
||||
const definitionLevels = new Array(daph.num_values)
|
||||
readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), 0, definitionLevels)
|
||||
readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), definitionLevels)
|
||||
|
||||
// count nulls
|
||||
let numNulls = daph.num_values
|
||||
@ -173,7 +173,7 @@ export function readDataPageV2(compressedBytes, ph, columnDecoder) {
|
||||
} else if (daph2.encoding === 'RLE') {
|
||||
// assert(type === 'BOOLEAN')
|
||||
dataPage = new Array(nValues)
|
||||
readRleBitPackedHybrid(pageReader, 1, 0, dataPage)
|
||||
readRleBitPackedHybrid(pageReader, 1, dataPage)
|
||||
dataPage = dataPage.map(x => !!x)
|
||||
} else if (
|
||||
daph2.encoding === 'PLAIN_DICTIONARY' ||
|
||||
@ -181,7 +181,7 @@ export function readDataPageV2(compressedBytes, ph, columnDecoder) {
|
||||
) {
|
||||
const bitWidth = pageView.getUint8(pageReader.offset++)
|
||||
dataPage = new Array(nValues)
|
||||
readRleBitPackedHybrid(pageReader, bitWidth, uncompressedPageSize - 1, dataPage)
|
||||
readRleBitPackedHybrid(pageReader, bitWidth, dataPage, uncompressedPageSize - 1)
|
||||
} else if (daph2.encoding === 'DELTA_BINARY_PACKED') {
|
||||
const int32 = type === 'INT32'
|
||||
dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues)
|
||||
@ -212,9 +212,7 @@ function readRepetitionLevelsV2(reader, daph2, schemaPath) {
|
||||
if (!maxRepetitionLevel) return []
|
||||
|
||||
const values = new Array(daph2.num_values)
|
||||
readRleBitPackedHybrid(
|
||||
reader, bitWidth(maxRepetitionLevel), daph2.repetition_levels_byte_length, values
|
||||
)
|
||||
readRleBitPackedHybrid(reader, bitWidth(maxRepetitionLevel), values, daph2.repetition_levels_byte_length)
|
||||
return values
|
||||
}
|
||||
|
||||
@ -229,7 +227,7 @@ function readDefinitionLevelsV2(reader, daph2, schemaPath) {
|
||||
if (maxDefinitionLevel) {
|
||||
// V2 we know the length
|
||||
const values = new Array(daph2.num_values)
|
||||
readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), daph2.definition_levels_byte_length, values)
|
||||
readRleBitPackedHybrid(reader, bitWidth(maxDefinitionLevel), values, daph2.definition_levels_byte_length)
|
||||
return values
|
||||
}
|
||||
}
|
||||
|
||||
@ -16,12 +16,12 @@ export function bitWidth(value) {
|
||||
* If length is zero, then read int32 length at the start.
|
||||
*
|
||||
* @param {DataReader} reader
|
||||
* @param {number} width - width of each bit-packed group
|
||||
* @param {number} length - length of the encoded data
|
||||
* @param {number} width - bitwidth
|
||||
* @param {DecodedArray} output
|
||||
* @param {number} [length] - length of the encoded data
|
||||
*/
|
||||
export function readRleBitPackedHybrid(reader, width, length, output) {
|
||||
if (!length) {
|
||||
export function readRleBitPackedHybrid(reader, width, output, length) {
|
||||
if (length === undefined) {
|
||||
length = reader.view.getUint32(reader.offset, true)
|
||||
reader.offset += 4
|
||||
}
|
||||
|
||||
@ -14,7 +14,7 @@ describe('readRle', () => {
|
||||
const reader = { view, offset: 0 }
|
||||
|
||||
const values = new Array(6)
|
||||
readRleBitPackedHybrid(reader, 1, 4, values)
|
||||
readRleBitPackedHybrid(reader, 1, values, 4)
|
||||
expect(reader.offset).toBe(4)
|
||||
expect(values).toEqual([1, 1, 1, 100, 100, 100])
|
||||
})
|
||||
@ -28,7 +28,7 @@ describe('readRle', () => {
|
||||
const reader = { view, offset: 0 }
|
||||
|
||||
const values = new Array(3)
|
||||
readRleBitPackedHybrid(reader, 16, 6, values)
|
||||
readRleBitPackedHybrid(reader, 16, values, 6)
|
||||
expect(reader.offset).toBe(6)
|
||||
expect(values).toEqual([65535, 65535, 65535])
|
||||
})
|
||||
@ -44,7 +44,7 @@ describe('readRle', () => {
|
||||
const reader = { view, offset: 0 }
|
||||
|
||||
const values = new Array(2)
|
||||
readRleBitPackedHybrid(reader, 24, 4, values)
|
||||
readRleBitPackedHybrid(reader, 24, values, 4)
|
||||
expect(reader.offset).toBe(4)
|
||||
expect(values).toEqual([16777215, 16777215])
|
||||
})
|
||||
@ -58,7 +58,7 @@ describe('readRle', () => {
|
||||
const reader = { view, offset: 0 }
|
||||
|
||||
const values = new Array(3)
|
||||
readRleBitPackedHybrid(reader, 32, 5, values)
|
||||
readRleBitPackedHybrid(reader, 32, values, 5)
|
||||
expect(reader.offset).toBe(5)
|
||||
expect(values).toEqual([234000, 234000, 234000])
|
||||
})
|
||||
@ -75,7 +75,7 @@ describe('readBitPacked', () => {
|
||||
const reader = { view, offset: 0 }
|
||||
|
||||
const values = new Array(3)
|
||||
readRleBitPackedHybrid(reader, 1, 0, values)
|
||||
readRleBitPackedHybrid(reader, 1, values)
|
||||
expect(reader.offset).toBe(6)
|
||||
expect(values).toEqual([0, 0, 1])
|
||||
})
|
||||
@ -90,7 +90,7 @@ describe('readBitPacked', () => {
|
||||
const reader = { view, offset: 0 }
|
||||
|
||||
const values = new Array(9)
|
||||
readRleBitPackedHybrid(reader, 1, 3, values)
|
||||
readRleBitPackedHybrid(reader, 1, values, 3)
|
||||
expect(reader.offset).toBe(3)
|
||||
expect(values).toEqual([1, 1, 1, 1, 1, 1, 1, 1, 1])
|
||||
})
|
||||
@ -110,7 +110,7 @@ describe('readBitPacked', () => {
|
||||
const reader = { view, offset: 0 }
|
||||
|
||||
const values = new Array(72)
|
||||
readRleBitPackedHybrid(reader, 17, 154, values)
|
||||
readRleBitPackedHybrid(reader, 17, values, 154)
|
||||
expect(reader.offset).toBe(154)
|
||||
expect(values).toEqual([
|
||||
131071, 0, 0, 0, 0, 0, 0, 0,
|
||||
@ -132,7 +132,7 @@ describe('readBitPacked', () => {
|
||||
const reader = { view, offset: 0 }
|
||||
|
||||
const values = new Array(3)
|
||||
expect(() => readRleBitPackedHybrid(reader, 1, 3, values))
|
||||
expect(() => readRleBitPackedHybrid(reader, 1, values, 3))
|
||||
.toThrow('parquet bitpack offset 1 out of range')
|
||||
})
|
||||
})
|
||||
|
||||
12
test/files/mostlyempty.json
Normal file
12
test/files/mostlyempty.json
Normal file
@ -0,0 +1,12 @@
|
||||
[
|
||||
[null],
|
||||
[null],
|
||||
[null],
|
||||
[null],
|
||||
[null],
|
||||
[null],
|
||||
[null],
|
||||
[null],
|
||||
[null],
|
||||
[null]
|
||||
]
|
||||
42
test/files/mostlyempty.metadata.json
Normal file
42
test/files/mostlyempty.metadata.json
Normal file
@ -0,0 +1,42 @@
|
||||
{
|
||||
"version": 2,
|
||||
"schema": [
|
||||
{
|
||||
"name": "root",
|
||||
"num_children": 1
|
||||
},
|
||||
{
|
||||
"type": "BYTE_ARRAY",
|
||||
"repetition_type": "OPTIONAL",
|
||||
"name": "empty"
|
||||
}
|
||||
],
|
||||
"num_rows": 10,
|
||||
"row_groups": [
|
||||
{
|
||||
"columns": [
|
||||
{
|
||||
"file_offset": 4,
|
||||
"meta_data": {
|
||||
"type": "BYTE_ARRAY",
|
||||
"encodings": ["RLE_DICTIONARY"],
|
||||
"path_in_schema": ["empty"],
|
||||
"codec": "SNAPPY",
|
||||
"num_values": 10,
|
||||
"total_uncompressed_size": 40,
|
||||
"total_compressed_size": 40,
|
||||
"data_page_offset": 18,
|
||||
"dictionary_page_offset": 4,
|
||||
"statistics": {
|
||||
"null_count": 10
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"total_byte_size": 40,
|
||||
"num_rows": 10
|
||||
}
|
||||
],
|
||||
"created_by": "hyparquet",
|
||||
"metadata_length": 82
|
||||
}
|
||||
BIN
test/files/mostlyempty.parquet
Normal file
BIN
test/files/mostlyempty.parquet
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user