Delta length byte array encoding

This commit is contained in:
Kenny Daniel 2024-05-17 23:44:55 -07:00
parent da72c06ac2
commit 1689d7473a
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
6 changed files with 1080 additions and 2 deletions

@ -182,7 +182,7 @@ Parquet encodings:
- [X] BIT_PACKED
- [X] DELTA_BINARY_PACKED
- [X] DELTA_BYTE_ARRAY
- [ ] DELTA_LENGTH_BYTE_ARRAY
- [X] DELTA_LENGTH_BYTE_ARRAY
- [ ] BYTE_STREAM_SPLIT
## Hysnappy

@ -1,5 +1,5 @@
import { decompressPage } from './column.js'
import { deltaBinaryUnpack, deltaByteArray } from './delta.js'
import { deltaBinaryUnpack, deltaByteArray, deltaLengthByteArray } from './delta.js'
import { readRleBitPackedHybrid, widthFromMaxInt } from './encoding.js'
import { readPlain } from './plain.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
@ -67,6 +67,9 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata,
const int32 = columnMetadata.type === 'INT32'
dataPage = int32 ? new Int32Array(nValues) : new BigInt64Array(nValues)
deltaBinaryUnpack(pageReader, nValues, dataPage)
} else if (daph2.encoding === 'DELTA_LENGTH_BYTE_ARRAY') {
dataPage = new Array(nValues)
deltaLengthByteArray(pageReader, nValues, dataPage)
} else if (daph2.encoding === 'DELTA_BYTE_ARRAY') {
dataPage = new Array(nValues)
deltaByteArray(pageReader, nValues, dataPage)

@ -61,6 +61,20 @@ export function deltaBinaryUnpack(reader, nValues, output) {
}
}
/**
 * Decode a DELTA_LENGTH_BYTE_ARRAY run into `output`.
 *
 * The per-value byte lengths are unpacked first via deltaBinaryUnpack, then
 * each value is taken from the bytes starting at the reader's current offset.
 * Every entry written to `output` is a Uint8Array view onto the reader's
 * underlying buffer (no bytes are copied), and `reader.offset` is advanced
 * past each value as it is consumed.
 *
 * @param {DataReader} reader source view plus running offset (mutated)
 * @param {number} nValues number of byte arrays to decode
 * @param {Uint8Array[]} output destination array, filled at indices 0..nValues-1
 */
export function deltaLengthByteArray(reader, nValues, output) {
  const byteLengths = new Int32Array(nValues)
  // presumably leaves reader.offset just past the encoded lengths — confirm in deltaBinaryUnpack
  deltaBinaryUnpack(reader, nValues, byteLengths)

  const { buffer, byteOffset } = reader.view
  let index = 0
  while (index < nValues) {
    const size = byteLengths[index]
    // offsets are relative to the view, so add the view's own start within the buffer
    output[index] = new Uint8Array(buffer, byteOffset + reader.offset, size)
    reader.offset += size
    index++
  }
}
/**
* @param {DataReader} reader
* @param {number} nValues

File diff suppressed because it is too large Load Diff

@ -0,0 +1,59 @@
{
"version": 2,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "schema",
"num_children": 1,
"field_id": -1
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "FRUIT",
"converted_type": "UTF8",
"field_id": 1,
"logical_type": {
"type": "STRING"
}
}
],
"num_rows": 1000,
"row_groups": [
{
"columns": [
{
"file_offset": 2629,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"RLE",
"DELTA_LENGTH_BYTE_ARRAY"
],
"path_in_schema": [
"FRUIT"
],
"codec": "ZSTD",
"num_values": 1000,
"total_uncompressed_size": 23747,
"total_compressed_size": 2625,
"data_page_offset": 4,
"encoding_stats": [
{
"page_type": 3,
"encoding": "DELTA_LENGTH_BYTE_ARRAY",
"count": 1
}
]
}
}
],
"total_byte_size": 23747,
"num_rows": 1000,
"file_offset": 0,
"total_compressed_size": 2625,
"ordinal": 0
}
],
"metadata_length": 105
}

Binary file not shown.