From c0e0c7cfe51aa6fcbf566e6ab365e650491d01a4 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Wed, 26 Nov 2025 16:04:47 -0800 Subject: [PATCH] Fix BYTE_STREAM_SPLIT with data page v2 and compression --- src/datapage.js | 2 +- test/files/byte_stream_split_v2.json | 7 + test/files/byte_stream_split_v2.metadata.json | 121 ++++++++++++++++++ test/files/byte_stream_split_v2.parquet | Bin 0 -> 772 bytes 4 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 test/files/byte_stream_split_v2.json create mode 100644 test/files/byte_stream_split_v2.metadata.json create mode 100644 test/files/byte_stream_split_v2.parquet diff --git a/src/datapage.js b/src/datapage.js index 9749fdf..fe4c28b 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -193,7 +193,7 @@ export function readDataPageV2(compressedBytes, ph, columnDecoder) { dataPage = new Array(nValues) deltaByteArray(pageReader, nValues, dataPage) } else if (daph2.encoding === 'BYTE_STREAM_SPLIT') { - dataPage = byteStreamSplit(reader, nValues, type, element.type_length) + dataPage = byteStreamSplit(pageReader, nValues, type, element.type_length) } else { throw new Error(`parquet unsupported encoding: ${daph2.encoding}`) } diff --git a/test/files/byte_stream_split_v2.json b/test/files/byte_stream_split_v2.json new file mode 100644 index 0000000..20f8c6c --- /dev/null +++ b/test/files/byte_stream_split_v2.json @@ -0,0 +1,7 @@ +[ + [1.5, 10.1], + [2.5, 20.2], + [3.5, 30.3], + [4.5, 40.4], + [5.5, 50.5] +] diff --git a/test/files/byte_stream_split_v2.metadata.json b/test/files/byte_stream_split_v2.metadata.json new file mode 100644 index 0000000..946a709 --- /dev/null +++ b/test/files/byte_stream_split_v2.metadata.json @@ -0,0 +1,121 @@ +{ + "version": 2, + "schema": [ + { + "repetition_type": "REQUIRED", + "name": "schema", + "num_children": 2 + }, + { + "type": "DOUBLE", + "repetition_type": "OPTIONAL", + "name": "float_col" + }, + { + "type": "DOUBLE", + "repetition_type": "OPTIONAL", + "name": "double_col" + } + ], + "num_rows": 5, + "row_groups": [ + { + "columns": [ + { + "file_offset": 0, + "meta_data": { + "type": "DOUBLE", + "encodings": [ + "RLE", + "BYTE_STREAM_SPLIT" + ], + "path_in_schema": [ + "float_col" + ], + "codec": "SNAPPY", + "num_values": 5, + "total_uncompressed_size": 110, + "total_compressed_size": 87, + "data_page_offset": 4, + "statistics": { + "max": 5.5, + "min": 1.5, + "null_count": 0, + "max_value": 5.5, + "min_value": 1.5, + "is_max_value_exact": true, + "is_min_value_exact": true + }, + "encoding_stats": [ + { + "page_type": "DATA_PAGE", + "encoding": "BYTE_STREAM_SPLIT", + "count": 1 + } + ], + "size_statistics": { + "repetition_level_histogram": [], + "definition_level_histogram": [ + 0, + 5 + ] + } + } + }, + { + "file_offset": 0, + "meta_data": { + "type": "DOUBLE", + "encodings": [ + "RLE", + "BYTE_STREAM_SPLIT" + ], + "path_in_schema": [ + "double_col" + ], + "codec": "SNAPPY", + "num_values": 5, + "total_uncompressed_size": 110, + "total_compressed_size": 97, + "data_page_offset": 91, + "statistics": { + "max": 50.5, + "min": 10.1, + "null_count": 0, + "max_value": 50.5, + "min_value": 10.1, + "is_max_value_exact": true, + "is_min_value_exact": true + }, + "encoding_stats": [ + { + "page_type": "DATA_PAGE", + "encoding": "BYTE_STREAM_SPLIT", + "count": 1 + } + ], + "size_statistics": { + "repetition_level_histogram": [], + "definition_level_histogram": [ + 0, + 5 + ] + } + } + } + ], + "total_byte_size": 220, + "num_rows": 5, + "file_offset": 4, + "total_compressed_size": 184 + } + ], + "key_value_metadata": [ + { + "key": "ARROW:schema", + "value": "/////7AAAAAQAAAAAAAKAAwABgAFAAgACgAAAAABBAAMAAAACAAIAAAABAAIAAAABAAAAAIAAABIAAAABAAAAND///8AAAEDEAAAABwAAAAEAAAAAAAAAAoAAABkb3VibGVfY29sAADC////AAACABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAxAAAAAgAAAABAAAAAAAAAAJAAAAZmxvYXRfY29sAAYACAAGAAYAAAAAAAIAAAAAAA==" + } + ], + "created_by": "parquet-cpp-arrow version 22.0.0", + "metadata_length": 576 +} diff --git a/test/files/byte_stream_split_v2.parquet b/test/files/byte_stream_split_v2.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b0fdf54a15809b5457783b3578ccf23c49e33c60 GIT binary patch literal 772 zcma)4O-~|05UrU-!cM$E(w*sK4jgczi66+U@guNI4XD9yiBU0|7(isr;tmcf{wj~g zvuFJW{s@m8t!^B}1QTDTyI<9GzgJ!99W2kXDBEN9A7(J1kzf%9am$Q>0(V2!%X1hX zl8wh{qoIuh1RA7zkG+fp#k*uSTa^Gb?WkXrZah@D-oX52hNN+f9>eugL zN$R^SBC>N}MmzPtjq@stQ1xTF`SY~pS1*t2eyblFC;nBf)ldm;2un=bU@=3CMH6CR zGt;@jpEZ7~*|BVRE5U=ym0+s3eF!4ewvg**~kT?w?jNiyg#bL6$;43R0ZRNv|T0DqlJr z*5~1k)S%)Bv^B( literal 0 HcmV?d00001