From 972402d0839dbcb719acb1f20bc75b712bf981b8 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Wed, 9 Apr 2025 16:38:18 -0700 Subject: [PATCH] Fix handling of dictionary pages from parquet.net --- src/column.js | 45 +++++++++++-------------- src/convert.js | 2 -- test/files/issue72.json | 5 +++ test/files/issue72.metadata.json | 56 +++++++++++++++++++++++++++++++ test/files/issue72.parquet | Bin 0 -> 621 bytes 5 files changed, 81 insertions(+), 27 deletions(-) create mode 100644 test/files/issue72.json create mode 100644 test/files/issue72.metadata.json create mode 100644 test/files/issue72.parquet diff --git a/src/column.js b/src/column.js index 8f9b952..be15467 100644 --- a/src/column.js +++ b/src/column.js @@ -1,6 +1,6 @@ import { assembleLists } from './assemble.js' import { Encoding, PageType } from './constants.js' -import { convertWithDictionary } from './convert.js' +import { convert, convertWithDictionary } from './convert.js' import { decompressPage, readDataPage, readDataPageV2 } from './datapage.js' import { readPlain } from './plain.js' import { isFlatColumn } from './schema.js' @@ -25,22 +25,26 @@ export function readColumn(reader, rowGroupStart, rowGroupEnd, columnMetadata, s let dictionary = undefined let rowCount = 0 - // read dictionary - if (hasDictionary(columnMetadata)) { - dictionary = readPage(reader, columnMetadata, schemaPath, element, dictionary, undefined, 0, options) - } - while (rowCount < rowGroupEnd) { if (reader.offset >= reader.view.byteLength - 1) break // end of reader - const lastChunk = chunks.at(-1) - const lastChunkLength = lastChunk ? lastChunk.length : 0 - const values = readPage(reader, columnMetadata, schemaPath, element, dictionary, lastChunk, rowGroupStart - rowCount, options) - if (lastChunk === values) { - // continued from previous page - rowCount += values.length - lastChunkLength + + // read page header + const header = parquetHeader(reader) + if (header.type === 'DICTIONARY_PAGE') { + // assert(!dictionary) + dictionary = readPage(reader, header, columnMetadata, schemaPath, element, dictionary, undefined, 0, options) + dictionary = convert(dictionary, element, options.utf8) } else { - chunks.push(values) - rowCount += values.length + const lastChunk = chunks.at(-1) + const lastChunkLength = lastChunk?.length || 0 + const values = readPage(reader, header, columnMetadata, schemaPath, element, dictionary, lastChunk, rowGroupStart - rowCount, options) + if (lastChunk === values) { + // continued from previous page + rowCount += values.length - lastChunkLength + } else { + chunks.push(values) + rowCount += values.length + } } } if (isFinite(rowGroupEnd)) { @@ -60,6 +64,7 @@ export function readColumn(reader, rowGroupStart, rowGroupEnd, columnMetadata, s * Read a page (data or dictionary) from a buffer. * * @param {DataReader} reader + * @param {PageHeader} header * @param {ColumnMetaData} columnMetadata * @param {SchemaTree[]} schemaPath * @param {SchemaElement} element @@ -69,9 +74,7 @@ export function readColumn(reader, rowGroupStart, rowGroupEnd, columnMetadata, s * @param {ParquetReadOptions} options * @returns {DecodedArray} */ -export function readPage(reader, columnMetadata, schemaPath, element, dictionary, previousChunk, pageStart, { utf8, compressors }) { - const header = parquetHeader(reader) // column header - +export function readPage(reader, header, columnMetadata, schemaPath, element, dictionary, previousChunk, pageStart, { utf8, compressors }) { // read compressed_page_size bytes const compressedBytes = new Uint8Array( reader.view.buffer, reader.view.byteOffset + reader.offset, header.compressed_page_size @@ -138,14 +141,6 @@ export function readPage(reader, columnMetadata, schemaPath, element, dictionary } } -/** - * @param {ColumnMetaData} columnMetadata - * @returns {boolean} - */ -function hasDictionary(columnMetadata) { - return columnMetadata.encodings.some(e => e.endsWith('_DICTIONARY')) -} - /** * Find the start byte offset for a column chunk. * diff --git a/src/convert.js b/src/convert.js index 3db468e..262c2ea 100644 --- a/src/convert.js +++ b/src/convert.js @@ -13,8 +13,6 @@ const dayMillis = 86400000 // 1 day in milliseconds */ export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) { if (dictionary && encoding.endsWith('_DICTIONARY')) { - // convert dictionary - dictionary = convert(dictionary, schemaElement, utf8) let output = data if (data instanceof Uint8Array && !(dictionary instanceof Uint8Array)) { // @ts-expect-error upgrade data to match dictionary type with fancy constructor diff --git a/test/files/issue72.json b/test/files/issue72.json new file mode 100644 index 0000000..1993859 --- /dev/null +++ b/test/files/issue72.json @@ -0,0 +1,5 @@ +[ + ["258d7fff-6418-499f-af07-c6611937d7d8"], + ["086f2968-327b-48a8-8cdf-64f46bcd8173"], + ["258d7fff-6418-499f-af07-c6611937d7d8"] +] diff --git a/test/files/issue72.metadata.json b/test/files/issue72.metadata.json new file mode 100644 index 0000000..32923a3 --- /dev/null +++ b/test/files/issue72.metadata.json @@ -0,0 +1,56 @@ +{ + "version": 1, + "schema": [ + { + "name": "root", + "num_children": 1 + }, + { + "type": "BYTE_ARRAY", + "repetition_type": "OPTIONAL", + "name": "TextColumn", + "converted_type": "UTF8", + "logical_type": { + "type": "STRING" + } + } + ], + "num_rows": 3, + "row_groups": [ + { + "columns": [ + { + "file_offset": 4, + "meta_data": { + "type": "BYTE_ARRAY", + "encodings": [ + "RLE", + "BIT_PACKED", + "PLAIN" + ], + "path_in_schema": [ + "TextColumn" + ], + "codec": "SNAPPY", + "num_values": 3, + "total_uncompressed_size": 283, + "total_compressed_size": 288, + "data_page_offset": 4, + "statistics": { + "max": "258d7fff-6418-499f-af07-c6611937d7d8", + "min": "086f2968-327b-48a8-8cdf-64f46bcd8173", + "null_count": 0, + "distinct_count": 2, + "max_value": "258d7fff-6418-499f-af07-c6611937d7d8", + "min_value": "086f2968-327b-48a8-8cdf-64f46bcd8173" + } + } + } + ], + "total_byte_size": 288, + "num_rows": 3 + } + ], + "created_by": "Parquet.Net version 4.25.0 (build 687fbb462e94eddd1dc5a0aa26f33ba8e53f60e3)", + "metadata_length": 321 +} diff --git a/test/files/issue72.parquet b/test/files/issue72.parquet new file mode 100644 index 0000000000000000000000000000000000000000..31d1a30d3e2c72bfb8f4d4cef8a2a3d015c539d6 GIT binary patch literal 621 zcmcIiK}*9h7)?V7L*_AIl_EWKW5QC}CQVZCGQ0>5E#en7 zV0QDk!@PWaAH4Uz_uw_l#{!4=Bfy`*$}b@R&Cf#$04OVEq;rl6UMR}z+EMLtNjo8m zqGr-aqYk)S30KxaQC7;9a-|hjojK4P7j0)$AsGhv0$(mL@*faJ@gzM)eUcudO#lf` zsplh|=g?0SKnP#}0fy1yHr)07fc-I^!pYf_eIIW6&Hi~i1*_PD35wz