From bf268e141cea1d5f94f5b6829e21b9d809be4cb0 Mon Sep 17 00:00:00 2001 From: Johan Levin Date: Wed, 19 Feb 2025 20:07:49 +0100 Subject: [PATCH] Use prepended length for bit-packed hybrid bool columns (#62) --- src/datapage.js | 3 +- src/encoding.js | 2 +- test/files/boolean_rle.json | 17 ++++++++ test/files/boolean_rle.metadata.json | 61 +++++++++++++++++++++++++++ test/files/boolean_rle.parquet | Bin 0 -> 432 bytes 5 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 test/files/boolean_rle.json create mode 100644 test/files/boolean_rle.metadata.json create mode 100644 test/files/boolean_rle.parquet diff --git a/src/datapage.js b/src/datapage.js index 69c79af..5d0b07c 100644 --- a/src/datapage.js +++ b/src/datapage.js @@ -38,7 +38,8 @@ export function readDataPage(bytes, daph, schemaPath, { type }) { const bitWidth = type === 'BOOLEAN' ? 1 : view.getUint8(reader.offset++) if (bitWidth) { dataPage = new Array(nValues) - readRleBitPackedHybrid(reader, bitWidth, view.byteLength - reader.offset, dataPage) + const encodedLength = type === 'BOOLEAN' ? 0 : view.byteLength - reader.offset + readRleBitPackedHybrid(reader, bitWidth, encodedLength, dataPage) } else { dataPage = new Uint8Array(nValues) // nValue zeroes } diff --git a/src/encoding.js b/src/encoding.js index 5d315a3..6f1f4ab 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -22,7 +22,7 @@ export function bitWidth(value) { */ export function readRleBitPackedHybrid(reader, width, length, output) { if (!length) { - // length = reader.view.getUint32(reader.offset, true) + length = reader.view.getUint32(reader.offset, true) reader.offset += 4 } let seen = 0 diff --git a/test/files/boolean_rle.json b/test/files/boolean_rle.json new file mode 100644 index 0000000..7c29a74 --- /dev/null +++ b/test/files/boolean_rle.json @@ -0,0 +1,17 @@ +[ + [1], + [1], + [1], + [1], + [1], + [null], + [null], + [null], + [null], + [null], + [0], + [0], + [0], + [0], + [0] +] diff --git a/test/files/boolean_rle.metadata.json b/test/files/boolean_rle.metadata.json new file mode 100644 index 0000000..43c937a --- /dev/null +++ b/test/files/boolean_rle.metadata.json @@ -0,0 +1,61 @@ +{ + "created_by": "Polars", + "key_value_metadata": [ + { + "key": "ARROW:schema", + "value": "/////3YAAAAEAAAA8v///xQAAAAEAAEAAAAKAAsACAAKAAQA+P///wwAAAAIAAgAAAAEAAEAAAAEAAAA7P///ywAAAAgAAAAGAAAAAEGAAAQABIABAAQABEACAAAAAwAAAAAAPz///8EAAQADQAAAEJvb2xlYW5Db2x1bW4A" + } + ], + "metadata_length": 308, + "num_rows": 15, + "row_groups": [ + { + "columns": [ + { + "column_index_length": 17, + "column_index_offset": 89, + "crypto_metadata": 17, + "file_offset": 47, + "meta_data": { + "codec": "SNAPPY", + "data_page_offset": 4, + "encodings": [ + "RLE" + ], + "num_values": 15, + "path_in_schema": [ + "BooleanColumn" + ], + "statistics": { + "max_value": true, + "min_value": false, + "null_count": 5 + }, + "total_compressed_size": 43, + "total_uncompressed_size": 41, + "type": "BOOLEAN" + }, + "offset_index_length": 10, + "offset_index_offset": 106 + } + ], + "file_offset": 4, + "num_rows": 15, + "ordinal": 0, + "total_byte_size": 41, + "total_compressed_size": 43 + } + ], + "schema": [ + { + "name": "root", + "num_children": 1 + }, + { + "name": "BooleanColumn", + "repetition_type": "OPTIONAL", + "type": "BOOLEAN" + } + ], + "version": 1 +} diff --git a/test/files/boolean_rle.parquet b/test/files/boolean_rle.parquet new file mode 100644 index 0000000000000000000000000000000000000000..00413192c8a3356912f7d9de7d0efce474c131c0 GIT binary patch literal 432 zcma)3!AiqG5Z#R$LQz5JELm8h2nNA|)J84hWzq(%9!zpb=tW4S=pk*O(YA^oLV0cb^$Qp<3AS`M!m29A z#k5zIPm?KF!V{^;eCPb9Rc=|5JZN)j5)CGma3B+82bOGIuFk6J5iD}tv;O!o;YlTx zf2;mKPJD1^igymXjIQGOj>wvfX1eQ`rZ@RnefKb)6z&^8)X^CtMWj=APW3YRuFcUk zVvRjS!UPXu;uj0e4-oHv>x}qR^UsDi