From 972402d0839dbcb719acb1f20bc75b712bf981b8 Mon Sep 17 00:00:00 2001
From: Kenny Daniel <platypii@gmail.com>
Date: Wed, 9 Apr 2025 16:38:18 -0700
Subject: [PATCH] Fix handling of dictionary pages from parquet.net

---
 src/column.js                    |  45 +++++++++++--------------
 src/convert.js                   |   2 --
 test/files/issue72.json          |   5 +++
 test/files/issue72.metadata.json |  56 +++++++++++++++++++++++++++++++
 test/files/issue72.parquet       | Bin 0 -> 621 bytes
 5 files changed, 81 insertions(+), 27 deletions(-)
 create mode 100644 test/files/issue72.json
 create mode 100644 test/files/issue72.metadata.json
 create mode 100644 test/files/issue72.parquet

diff --git a/src/column.js b/src/column.js
index 8f9b952..be15467 100644
--- a/src/column.js
+++ b/src/column.js
@@ -1,6 +1,6 @@
 import { assembleLists } from './assemble.js'
 import { Encoding, PageType } from './constants.js'
-import { convertWithDictionary } from './convert.js'
+import { convert, convertWithDictionary } from './convert.js'
 import { decompressPage, readDataPage, readDataPageV2 } from './datapage.js'
 import { readPlain } from './plain.js'
 import { isFlatColumn } from './schema.js'
@@ -25,22 +25,26 @@ export function readColumn(reader, rowGroupStart, rowGroupEnd, columnMetadata, s
   let dictionary = undefined
   let rowCount = 0
 
-  // read dictionary
-  if (hasDictionary(columnMetadata)) {
-    dictionary = readPage(reader, columnMetadata, schemaPath, element, dictionary, undefined, 0, options)
-  }
-
   while (rowCount < rowGroupEnd) {
     if (reader.offset >= reader.view.byteLength - 1) break // end of reader
-    const lastChunk = chunks.at(-1)
-    const lastChunkLength = lastChunk ? lastChunk.length : 0
-    const values = readPage(reader, columnMetadata, schemaPath, element, dictionary, lastChunk, rowGroupStart - rowCount, options)
-    if (lastChunk === values) {
-      // continued from previous page
-      rowCount += values.length - lastChunkLength
+
+    // read page header
+    const header = parquetHeader(reader)
+    if (header.type === 'DICTIONARY_PAGE') {
+      // assert(!dictionary)
+      dictionary = readPage(reader, header, columnMetadata, schemaPath, element, dictionary, undefined, 0, options)
+      dictionary = convert(dictionary, element, options.utf8)
     } else {
-      chunks.push(values)
-      rowCount += values.length
+      const lastChunk = chunks.at(-1)
+      const lastChunkLength = lastChunk?.length || 0
+      const values = readPage(reader, header, columnMetadata, schemaPath, element, dictionary, lastChunk, rowGroupStart - rowCount, options)
+      if (lastChunk === values) {
+        // continued from previous page
+        rowCount += values.length - lastChunkLength
+      } else {
+        chunks.push(values)
+        rowCount += values.length
+      }
     }
   }
   if (isFinite(rowGroupEnd)) {
@@ -60,6 +64,7 @@ export function readColumn(reader, rowGroupStart, rowGroupEnd, columnMetadata, s
  * Read a page (data or dictionary) from a buffer.
  *
  * @param {DataReader} reader
+ * @param {PageHeader} header
  * @param {ColumnMetaData} columnMetadata
  * @param {SchemaTree[]} schemaPath
  * @param {SchemaElement} element
@@ -69,9 +74,7 @@ export function readColumn(reader, rowGroupStart, rowGroupEnd, columnMetadata, s
  * @param {ParquetReadOptions} options
  * @returns {DecodedArray}
  */
-export function readPage(reader, columnMetadata, schemaPath, element, dictionary, previousChunk, pageStart, { utf8, compressors }) {
-  const header = parquetHeader(reader) // column header
-
+export function readPage(reader, header, columnMetadata, schemaPath, element, dictionary, previousChunk, pageStart, { utf8, compressors }) {
   // read compressed_page_size bytes
   const compressedBytes = new Uint8Array(
     reader.view.buffer, reader.view.byteOffset + reader.offset, header.compressed_page_size
@@ -138,14 +141,6 @@ export function readPage(reader, columnMetadata, schemaPath, element, dictionary
   }
 }
 
-/**
- * @param {ColumnMetaData} columnMetadata
- * @returns {boolean}
- */
-function hasDictionary(columnMetadata) {
-  return columnMetadata.encodings.some(e => e.endsWith('_DICTIONARY'))
-}
-
 /**
  * Find the start byte offset for a column chunk.
  *
diff --git a/src/convert.js b/src/convert.js
index 3db468e..262c2ea 100644
--- a/src/convert.js
+++ b/src/convert.js
@@ -13,8 +13,6 @@ const dayMillis = 86400000 // 1 day in milliseconds
  */
 export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
   if (dictionary && encoding.endsWith('_DICTIONARY')) {
-    // convert dictionary
-    dictionary = convert(dictionary, schemaElement, utf8)
     let output = data
     if (data instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
       // @ts-expect-error upgrade data to match dictionary type with fancy constructor
diff --git a/test/files/issue72.json b/test/files/issue72.json
new file mode 100644
index 0000000..1993859
--- /dev/null
+++ b/test/files/issue72.json
@@ -0,0 +1,5 @@
+[
+  ["258d7fff-6418-499f-af07-c6611937d7d8"],
+  ["086f2968-327b-48a8-8cdf-64f46bcd8173"],
+  ["258d7fff-6418-499f-af07-c6611937d7d8"]
+]
diff --git a/test/files/issue72.metadata.json b/test/files/issue72.metadata.json
new file mode 100644
index 0000000..32923a3
--- /dev/null
+++ b/test/files/issue72.metadata.json
@@ -0,0 +1,56 @@
+{
+  "version": 1,
+  "schema": [
+    {
+      "name": "root",
+      "num_children": 1
+    },
+    {
+      "type": "BYTE_ARRAY",
+      "repetition_type": "OPTIONAL",
+      "name": "TextColumn",
+      "converted_type": "UTF8",
+      "logical_type": {
+        "type": "STRING"
+      }
+    }
+  ],
+  "num_rows": 3,
+  "row_groups": [
+    {
+      "columns": [
+        {
+          "file_offset": 4,
+          "meta_data": {
+            "type": "BYTE_ARRAY",
+            "encodings": [
+              "RLE",
+              "BIT_PACKED",
+              "PLAIN"
+            ],
+            "path_in_schema": [
+              "TextColumn"
+            ],
+            "codec": "SNAPPY",
+            "num_values": 3,
+            "total_uncompressed_size": 283,
+            "total_compressed_size": 288,
+            "data_page_offset": 4,
+            "statistics": {
+              "max": "258d7fff-6418-499f-af07-c6611937d7d8",
+              "min": "086f2968-327b-48a8-8cdf-64f46bcd8173",
+              "null_count": 0,
+              "distinct_count": 2,
+              "max_value": "258d7fff-6418-499f-af07-c6611937d7d8",
+              "min_value": "086f2968-327b-48a8-8cdf-64f46bcd8173"
+            }
+          }
+        }
+      ],
+      "total_byte_size": 288,
+      "num_rows": 3
+    }
+  ],
+  "created_by": "Parquet.Net version 4.25.0 (build 687fbb462e94eddd1dc5a0aa26f33ba8e53f60e3)",
+  "metadata_length": 321
+}
diff --git a/test/files/issue72.parquet b/test/files/issue72.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..31d1a30d3e2c72bfb8f4d4cef8a2a3d015c539d6
GIT binary patch
literal 621
zcmcIiK}*9h7)?V7L*_AIl_EWKW5QC}CQVZCGQ0>5E#e<Yn*;`P#o38J$CE$7vp>n7
zV0QDk!@PWaAH4Uz_uw_l#{!4=Bfy`*$}b@R&Cf#$04OVEq;rl6UMR}z+EMLtNjo8m
zqGr-aqYk)S30KxaQC7;9a-|hjojK4P7j0)$AsGhv0$(mL@*faJ@gzM)eUcudO#lf`
zsplh|=g?0SKnP#}0fy1yHr)07fc-I^!pYf_eIIW6&Hi~i1*_PD35wz<o{eHWjw&<)
z(O(RP1d*?he225q^(npnLw0YHATYmc^zLPEhwR=C$(!B1uKR7mv$D$aWZv%A8<Pkn
kUE6X|+L~KqOkui8=USJ-G1h8jE9OLQ*%ep@zE!?xzj~sL(EtDd

literal 0
HcmV?d00001