From 1f8289b4b29b796e38fbbaa9a764add2b6c4efcb Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Wed, 22 May 2024 18:23:13 -0700 Subject: [PATCH] rle_boolean_encoding.parquet --- src/datapageV2.js | 6 ++ src/encoding.js | 1 + test/files/rle_boolean_encoding.json | 70 ++++++++++++++++++ test/files/rle_boolean_encoding.metadata.json | 49 ++++++++++++ test/files/rle_boolean_encoding.parquet | Bin 0 -> 192 bytes 5 files changed, 126 insertions(+) create mode 100644 test/files/rle_boolean_encoding.json create mode 100644 test/files/rle_boolean_encoding.metadata.json create mode 100644 test/files/rle_boolean_encoding.parquet diff --git a/src/datapageV2.js b/src/datapageV2.js index ff4c0a6..ccfab31 100644 --- a/src/datapageV2.js +++ b/src/datapageV2.js @@ -3,6 +3,7 @@ import { deltaBinaryUnpack, deltaByteArray, deltaLengthByteArray } from './delta import { bitWidth, byteStreamSplit, readRleBitPackedHybrid } from './encoding.js' import { readPlain } from './plain.js' import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js' +import { readVarInt } from './thrift.js' /** * Read a data page from the given Uint8Array. @@ -31,6 +32,11 @@ export function readDataPageV2(compressedBytes, ph, schemaPath, columnMetadata, // assert(reader.offset === daph2.repetition_levels_byte_length) // definition levels + const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) + if (columnMetadata.type === 'BOOLEAN' && maxDefinitionLevel) { + // special case for boolean data page v2 + readVarInt(reader) // assert(=== num_values) + } const definitionLevels = readDefinitionLevelsV2(reader, daph2, schemaPath) // assert(reader.offset === daph2.repetition_levels_byte_length + daph2.definition_levels_byte_length) diff --git a/src/encoding.js b/src/encoding.js index 1a1acdd..6c80558 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -57,6 +57,7 @@ function readRle(reader, count, bitWidth, output, seen) { let value = 0 if (width === 1) { value = reader.view.getUint8(reader.offset) + // assert(value < 1 << bitWidth) } else if (width === 2) { value = reader.view.getUint16(reader.offset, true) } else if (width === 4) { diff --git a/test/files/rle_boolean_encoding.json b/test/files/rle_boolean_encoding.json new file mode 100644 index 0000000..7d03549 --- /dev/null +++ b/test/files/rle_boolean_encoding.json @@ -0,0 +1,70 @@ +[ + [true], + [false], + [null], + [true], + [true], + [false], + [false], + [true], + [true], + [true], + [false], + [false], + [true], + [true], + [false], + [null], + [true], + [true], + [false], + [false], + [true], + [true], + [false], + [null], + [true], + [true], + [false], + [false], + [true], + [true], + [true], + [false], + [false], + [false], + [false], + [true], + [true], + [false], + [null], + [true], + [true], + [false], + [false], + [true], + [true], + [true], + [false], + [false], + [null], + [true], + [true], + [false], + [false], + [true], + [true], + [true], + [false], + [true], + [true], + [false], + [null], + [true], + [true], + [false], + [false], + [true], + [true], + [true] +] diff --git a/test/files/rle_boolean_encoding.metadata.json b/test/files/rle_boolean_encoding.metadata.json new file mode 100644 index 0000000..a8ceaa3 --- /dev/null +++ b/test/files/rle_boolean_encoding.metadata.json @@ -0,0 +1,49 @@ +{ + "version": 1, + "schema": [ + { + "name": "table", + "num_children": 1 + }, + { + "type": "BOOLEAN", + "repetition_type": "OPTIONAL", + "name": "datatype_boolean", + "field_id": 1 + } + ], + "num_rows": 68, + "row_groups": [ + { + "columns": [ + { + "file_offset": 0, + "meta_data": { + "type": "BOOLEAN", + "encodings": [ + "RLE" + ], + "path_in_schema": [ + "datatype_boolean" + ], + "codec": "GZIP", + "num_values": 68, + "total_uncompressed_size": 49, + "total_compressed_size": 69, + "data_page_offset": 4, + "statistics": { + "max": true, + "min": false, + "null_count": 6, + "max_value": true, + "min_value": false + } + } + } + ], + "total_byte_size": 69, + "num_rows": 68 + } + ], + "metadata_length": 111 +} diff --git a/test/files/rle_boolean_encoding.parquet b/test/files/rle_boolean_encoding.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6a6de0a9422bb42b08139f58a5b18f1df4ed6ed6 GIT binary patch literal 192 zcmWG=3^EjD6EzWyi4pB!6y*UCY@%YKEDQ`CjO@Sb>tz@@|NY)?FW=3<00ECv5)uS# zs*T(8w|y-0W+?vTIfWM}Bg!PH