From 5eeb05da4061d98ad2eb2de72e7b24124292b640 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Tue, 21 May 2024 22:50:50 -0700 Subject: [PATCH] dict-page-offset-zero.parquet --- src/column.js | 10 +- test/files/dict-page-offset-zero.json | 41 ++++++++ .../files/dict-page-offset-zero.metadata.json | 94 ++++++++++++++++++ test/files/dict-page-offset-zero.parquet | Bin 0 -> 635 bytes 4 files changed, 140 insertions(+), 5 deletions(-) create mode 100644 test/files/dict-page-offset-zero.json create mode 100644 test/files/dict-page-offset-zero.metadata.json create mode 100644 test/files/dict-page-offset-zero.parquet diff --git a/src/column.js b/src/column.js index c79542f..f4f7089 100644 --- a/src/column.js +++ b/src/column.js @@ -28,14 +28,14 @@ import { concat } from './utils.js' export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, schemaPath, compressors) { /** @type {ArrayLike | undefined} */ let dictionary = undefined - let valuesSeen = 0 + let seen = 0 /** @type {any[]} */ const rowData = [] const { element } = schemaPath[schemaPath.length - 1] // column reader: const reader = { view: new DataView(arrayBuffer, columnOffset), offset: 0 } - while (valuesSeen < rowGroup.num_rows) { + while (seen < rowGroup.num_rows) { // parse column header const header = parquetHeader(reader) if (header.compressed_page_size === undefined) { @@ -58,7 +58,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, compressedBytes, Number(header.uncompressed_page_size), columnMetadata.codec, compressors ) const { definitionLevels, repetitionLevels, dataPage } = readDataPage(page, daph, schemaPath, columnMetadata) - valuesSeen += daph.num_values + seen += daph.num_values // assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length)) // construct output values: skip nulls and construct lists @@ -89,7 +89,7 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, const { definitionLevels, repetitionLevels, dataPage } = readDataPageV2( compressedBytes, header, schemaPath, columnMetadata, compressors ) - valuesSeen += daph2.num_values + seen += daph2.num_values dereferenceDictionary(dictionary, dataPage) values = convert(dataPage, element) @@ -145,7 +145,7 @@ function dereferenceDictionary(dictionary, dataPage) { */ export function getColumnOffset({ dictionary_page_offset, data_page_offset }) { let columnOffset = dictionary_page_offset - if (dictionary_page_offset === undefined || data_page_offset < dictionary_page_offset) { + if (!dictionary_page_offset || data_page_offset < dictionary_page_offset) { columnOffset = data_page_offset } return Number(columnOffset) diff --git a/test/files/dict-page-offset-zero.json b/test/files/dict-page-offset-zero.json new file mode 100644 index 0000000..5e5d65e --- /dev/null +++ b/test/files/dict-page-offset-zero.json @@ -0,0 +1,41 @@ +[ + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552], + [1552] +] diff --git a/test/files/dict-page-offset-zero.metadata.json b/test/files/dict-page-offset-zero.metadata.json new file mode 100644 index 0000000..f3e642f --- /dev/null +++ b/test/files/dict-page-offset-zero.metadata.json @@ -0,0 +1,94 @@ +{ + "version": 1, + "schema": [ + { + "name": "root", + "num_children": 1 + }, + { + "type": "INT32", + "repetition_type": "OPTIONAL", + "name": "l_partkey" + } + ], + "num_rows": 39, + "row_groups": [ + { + "columns": [ + { + "file_offset": 4, + "meta_data": { + "type": "INT32", + "encodings": [ + "PLAIN", + "BIT_PACKED", + "RLE" + ], + "path_in_schema": [ + "l_partkey" + ], + "codec": "SNAPPY", + "num_values": 39, + "total_uncompressed_size": 180, + "total_compressed_size": 40, + "data_page_offset": 4, + "dictionary_page_offset": 0, + "statistics": { + "max": 1552, + "min": 1552, + "null_count": 0, + "max_value": 1552, + "min_value": 1552 + }, + "encoding_stats": [ + { + "page_type": 0, + "encoding": "PLAIN", + "count": 1 + } + ], + "bloom_filter_length": [ + { + "field_1": { + "field_1": 0, + "field_2": 162, + "field_3": 22, + "field_5": { + "field_1": 39, + "field_2": 0, + "field_3": 3, + "field_4": 4 + } + }, + "field_2": 22 + } + ] + }, + "offset_index_offset": 67, + "offset_index_length": 10, + "column_index_offset": 44, + "column_index_length": 23, + "crypto_metadata": 23 + } + ], + "total_byte_size": 180, + "num_rows": 39 + } + ], + "key_value_metadata": [ + { + "key": "is.date.correct", + "value": "true" + }, + { + "key": "dremio.arrow.schema.2.1", + "value": "{\n \"fields\" : [ {\n \"name\" : \"l_partkey\",\n \"nullable\" : true,\n \"type\" : {\n \"name\" : \"int\",\n \"bitWidth\" : 32,\n \"isSigned\" : true\n },\n \"children\" : [ ]\n } ]\n}" + }, + { + "key": "dremio.version", + "value": "3.2.0-201905102005330382-0598733" + } + ], + "created_by": "parquet-mr version 1.12.0-201812210311360288-a86293f (build cec1a483e9dcd545e09170ae787d3dcb13744433)", + "metadata_length": 550 +} diff --git a/test/files/dict-page-offset-zero.parquet b/test/files/dict-page-offset-zero.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f9dbd7fd5e286b9b7ab945e9cdb0a0a8ea6ed252 GIT binary patch literal 635 zcmZ8fO^eh(5UmMeh4z*2o5aTsTAKh&9=x_y*J)&#)Y!iH0tj=$W~8UgKngLCu_YOym>Tg4J`Y&@^dTd z0Uk_`r4EyIwUv#TpdA~$G~9t)o*ck^IL-A6BicB)XLN>`V7DbG5#`pz|vTRjm7g5fy$k}FZS|F9ESQ%$r