Fix BYTE_STREAM_SPLIT with data page v2 and compression

This commit is contained in:
Kenny Daniel 2025-11-26 16:04:47 -08:00
parent 86f104357d
commit c0e0c7cfe5
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 129 additions and 1 deletion

@@ -193,7 +193,7 @@ export function readDataPageV2(compressedBytes, ph, columnDecoder) {
dataPage = new Array(nValues)
deltaByteArray(pageReader, nValues, dataPage)
} else if (daph2.encoding === 'BYTE_STREAM_SPLIT') {
- dataPage = byteStreamSplit(reader, nValues, type, element.type_length)
+ dataPage = byteStreamSplit(pageReader, nValues, type, element.type_length)
} else {
throw new Error(`parquet unsupported encoding: ${daph2.encoding}`)
}

@@ -0,0 +1,7 @@
[
[1.5, 10.1],
[2.5, 20.2],
[3.5, 30.3],
[4.5, 40.4],
[5.5, 50.5]
]

@@ -0,0 +1,121 @@
{
"version": 2,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "schema",
"num_children": 2
},
{
"type": "DOUBLE",
"repetition_type": "OPTIONAL",
"name": "float_col"
},
{
"type": "DOUBLE",
"repetition_type": "OPTIONAL",
"name": "double_col"
}
],
"num_rows": 5,
"row_groups": [
{
"columns": [
{
"file_offset": 0,
"meta_data": {
"type": "DOUBLE",
"encodings": [
"RLE",
"BYTE_STREAM_SPLIT"
],
"path_in_schema": [
"float_col"
],
"codec": "SNAPPY",
"num_values": 5,
"total_uncompressed_size": 110,
"total_compressed_size": 87,
"data_page_offset": 4,
"statistics": {
"max": 5.5,
"min": 1.5,
"null_count": 0,
"max_value": 5.5,
"min_value": 1.5,
"is_max_value_exact": true,
"is_min_value_exact": true
},
"encoding_stats": [
{
"page_type": "DATA_PAGE",
"encoding": "BYTE_STREAM_SPLIT",
"count": 1
}
],
"size_statistics": {
"repetition_level_histogram": [],
"definition_level_histogram": [
0,
5
]
}
}
},
{
"file_offset": 0,
"meta_data": {
"type": "DOUBLE",
"encodings": [
"RLE",
"BYTE_STREAM_SPLIT"
],
"path_in_schema": [
"double_col"
],
"codec": "SNAPPY",
"num_values": 5,
"total_uncompressed_size": 110,
"total_compressed_size": 97,
"data_page_offset": 91,
"statistics": {
"max": 50.5,
"min": 10.1,
"null_count": 0,
"max_value": 50.5,
"min_value": 10.1,
"is_max_value_exact": true,
"is_min_value_exact": true
},
"encoding_stats": [
{
"page_type": "DATA_PAGE",
"encoding": "BYTE_STREAM_SPLIT",
"count": 1
}
],
"size_statistics": {
"repetition_level_histogram": [],
"definition_level_histogram": [
0,
5
]
}
}
}
],
"total_byte_size": 220,
"num_rows": 5,
"file_offset": 4,
"total_compressed_size": 184
}
],
"key_value_metadata": [
{
"key": "ARROW:schema",
"value": "/////7AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABIAAAABAAAAND///8AAAEDEAAAABwAAAAEAAAAAAAAAAoAAABkb3VibGVfY29sAADC////AAACABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAxAAAAAgAAAABAAAAAAAAAAJAAAAZmxvYXRfY29sAAYACAAGAAYAAAAAAAIAAAAAAA=="
}
],
"created_by": "parquet-cpp-arrow version 22.0.0",
"metadata_length": 576
}

Binary file not shown.