Upgrade dataPage to match dictionary type

This commit is contained in:
Kenny Daniel 2024-05-22 23:45:02 -07:00
parent c4ad05e580
commit b8e4496063
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
11 changed files with 1164 additions and 123 deletions

@ -1,5 +1,5 @@
import { assembleLists } from './assemble.js'
import { convert } from './convert.js'
import { convert, dereferenceDictionary } from './convert.js'
import { readDataPage, readDictionaryPage } from './datapage.js'
import { readDataPageV2 } from './datapageV2.js'
import { parquetHeader } from './header.js'
@ -11,6 +11,7 @@ import { concat } from './utils.js'
* Parse column data from a buffer.
*
* @typedef {import('./types.js').ColumnMetaData} ColumnMetaData
* @typedef {import('./types.js').DecodedArray} DecodedArray
* @param {import('./types.js').DataReader} reader
* @param {import('./types.js').RowGroup} rowGroup row group metadata
* @param {ColumnMetaData} columnMetadata column metadata
@ -20,7 +21,7 @@ import { concat } from './utils.js'
*/
export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compressors, utf8 }) {
const { element } = schemaPath[schemaPath.length - 1]
/** @type {ArrayLike<any> | undefined} */
/** @type {DecodedArray | undefined} */
let dictionary = undefined
let seen = 0
/** @type {any[]} */
@ -49,8 +50,8 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
// assert(!daph.statistics || daph.statistics.null_count === BigInt(daph.num_values - dataPage.length))
// construct output values: skip nulls and construct lists
dereferenceDictionary(dictionary, dataPage)
values = convert(dataPage, element, utf8)
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
@ -78,8 +79,8 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
)
seen += daph2.num_values
dereferenceDictionary(dictionary, dataPage)
values = convert(dataPage, element, utf8)
values = dereferenceDictionary(dictionary, dataPage)
values = convert(values, element, utf8)
if (repetitionLevels.length || definitionLevels?.length) {
// Use repetition levels to construct lists
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
@ -109,21 +110,6 @@ export function readColumn(reader, rowGroup, columnMetadata, schemaPath, { compr
return rowData
}
/**
* Map data to dictionary values in place.
*
* @typedef {import('./types.js').DecodedArray} DecodedArray
* @param {ArrayLike<any> | undefined} dictionary
* @param {DecodedArray} dataPage
*/
function dereferenceDictionary(dictionary, dataPage) {
if (dictionary) {
for (let i = 0; i < dataPage.length; i++) {
dataPage[i] = dictionary[dataPage[i]]
}
}
}
/**
* Find the start byte offset for a column chunk.
*

@ -97,3 +97,25 @@ export function parseFloat16(bytes) {
if (exp === 0x1f) return frac ? NaN : sign * Infinity
return sign * Math.pow(2, exp - 15) * (1 + frac / 1024)
}
/**
 * Map data page indexes to dictionary values.
 *
 * When the index page is a typed array of a different type than the
 * dictionary (for example Uint8Array indexes with a Float64Array or string
 * dictionary), writing looked-up values back into the index array would
 * truncate or corrupt them. In that case a new output array of the
 * dictionary's type is allocated. Otherwise the mapping is done in place
 * and the (mutated) dataPage is returned.
 *
 * @param {DecodedArray | undefined} dictionary
 * @param {DecodedArray} dataPage array of dictionary indexes
 * @returns {DecodedArray} dataPage values mapped through the dictionary
 */
export function dereferenceDictionary(dictionary, dataPage) {
  let output = dataPage
  if (dictionary) {
    // upgrade any typed-array index page whose type differs from the
    // dictionary's (not just Uint8Array) so values are never truncated
    if (ArrayBuffer.isView(dataPage) && dataPage.constructor !== dictionary.constructor) {
      // @ts-expect-error not my fault typescript doesn't understand constructors
      output = new dictionary.constructor(dataPage.length)
    }
    for (let i = 0; i < dataPage.length; i++) {
      output[i] = dictionary[dataPage[i]]
    }
  }
  return output
}

@ -54,13 +54,11 @@ export function readDataPage(bytes, daph, schemaPath, { type }) {
}
/**
* Read a page containing dictionary data.
*
* @param {Uint8Array} bytes raw page data
* @param {import("./types.d.ts").DictionaryPageHeader} diph dictionary page header
* @param {ColumnMetaData} columnMetadata
* @param {number | undefined} typeLength - type_length from schema
* @returns {ArrayLike<any>} array of values
* @returns {DecodedArray}
*/
export function readDictionaryPage(bytes, diph, columnMetadata, typeLength) {
const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)

@ -111,7 +111,8 @@ describe('convert function', () => {
})
describe('parseFloat16', () => {
it('should convert numbers', () => {
it('convert float16 numbers', () => {
expect(parseFloat16(undefined)).toBe(undefined)
expect(parseFloat16(new Uint8Array([0x00, 0xbc]))).toBe(-1)
expect(parseFloat16(new Uint8Array([0x00, 0x00]))).toBe(0)
expect(parseFloat16(new Uint8Array([0x00, 0x38]))).toBe(0.5)
@ -119,22 +120,22 @@ describe('parseFloat16', () => {
expect(parseFloat16(new Uint8Array([0x00, 0x40]))).toBe(2)
})
it('should convert -0', () => {
it('convert float16 -0', () => {
expect(parseFloat16(new Uint8Array([0x00, 0x80]))).toBe(-0)
expect(parseFloat16(new Uint8Array([0x00, 0x80]))).not.toBe(0)
})
it('should convert Infinity', () => {
it('convert float16 Infinity', () => {
expect(parseFloat16(new Uint8Array([0x00, 0x7c]))).toBe(Infinity)
expect(parseFloat16(new Uint8Array([0x00, 0xfc]))).toBe(-Infinity)
})
it('should convert NaN', () => {
it('convert float16 NaN', () => {
expect(parseFloat16(new Uint8Array([0x00, 0x7e]))).toBeNaN()
expect(parseFloat16(new Uint8Array([0x01, 0x7e]))).toBeNaN()
})
it('should convert a subnormal number', () => {
it('convert float16 subnormal number', () => {
expect(parseFloat16(new Uint8Array([0xff, 0x03])))
.toBeCloseTo(Math.pow(2, -14) * (1023 / 1024), 5)
})

@ -1,12 +0,0 @@
[
[ "Block" ],
[ "Intersection" ],
[ "Block" ],
[ "Block" ],
[ null ],
[ "Block" ],
[ "Intersection" ],
[ "Block" ],
[ "Block" ],
[ "Intersection" ]
]

@ -1,50 +0,0 @@
{
"version": 1,
"created_by": "DuckDB",
"metadata_length": 149,
"schema": [
{
"repetition_type": "REQUIRED",
"name": "duckdb_schema",
"num_children": 1
},
{
"type": "BYTE_ARRAY",
"repetition_type": "OPTIONAL",
"name": "ADDRTYPE",
"converted_type": "UTF8"
}
],
"num_rows": 10,
"row_groups": [
{
"columns": [
{
"file_offset": 0,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["PLAIN", "RLE_DICTIONARY"],
"path_in_schema": ["ADDRTYPE"],
"codec": "SNAPPY",
"num_values": 10,
"total_uncompressed_size": 78,
"total_compressed_size": 82,
"data_page_offset": 31,
"dictionary_page_offset": 4,
"statistics": {
"max": "Intersection",
"min": "Block",
"max_value": "Intersection",
"min_value": "Block",
"null_count": 1,
"distinct_count": 2
}
}
}
],
"file_offset": 4,
"total_byte_size": 33024,
"num_rows": 10
}
]
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,125 @@
{
"version": 1,
"schema": [
{
"name": "m",
"num_children": 2
},
{
"type": "INT64",
"repetition_type": "REQUIRED",
"name": "long_field"
},
{
"type": "BYTE_ARRAY",
"repetition_type": "REQUIRED",
"name": "binary_field"
}
],
"num_rows": 1000,
"row_groups": [
{
"columns": [
{
"file_offset": 31,
"meta_data": {
"type": "INT64",
"encodings": [
"PLAIN_DICTIONARY",
"BIT_PACKED"
],
"path_in_schema": [
"long_field"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 54,
"total_compressed_size": 54,
"data_page_offset": 31,
"dictionary_page_offset": 4,
"statistics": {
"max": 0,
"min": 0,
"null_count": 0,
"max_value": 0,
"min_value": 0
},
"encoding_stats": [
{
"page_type": 2,
"encoding": "PLAIN_DICTIONARY",
"count": 1
},
{
"page_type": 0,
"encoding": "PLAIN_DICTIONARY",
"count": 1
}
]
},
"offset_index_offset": 262,
"offset_index_length": 10,
"column_index_offset": 144,
"column_index_length": 31,
"crypto_metadata": 31
},
{
"file_offset": 117,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": [
"PLAIN_DICTIONARY",
"BIT_PACKED"
],
"path_in_schema": [
"binary_field"
],
"codec": "UNCOMPRESSED",
"num_values": 1000,
"total_uncompressed_size": 86,
"total_compressed_size": 86,
"data_page_offset": 117,
"dictionary_page_offset": 58,
"statistics": {
"max": "a655fd0e-9949-4059-bcae-fd6a002a4652",
"min": "a655fd0e-9949-4059-bcae-fd6a002a4652",
"null_count": 0,
"max_value": "a655fd0e-9949-4059-bcae-fd6a002a4652",
"min_value": "a655fd0e-9949-4059-bcae-fd6a002a4652"
},
"encoding_stats": [
{
"page_type": 2,
"encoding": "PLAIN_DICTIONARY",
"count": 1
},
{
"page_type": 0,
"encoding": "PLAIN_DICTIONARY",
"count": 1
}
]
},
"offset_index_offset": 272,
"offset_index_length": 11,
"column_index_offset": 175,
"column_index_length": 87,
"crypto_metadata": 87
}
],
"total_byte_size": 140,
"num_rows": 1000,
"file_offset": 4,
"total_compressed_size": 140,
"ordinal": 0
}
],
"key_value_metadata": [
{
"key": "writer.model.name",
"value": "example"
}
],
"created_by": "parquet-mr version 1.13.0-SNAPSHOT (build 261f7d2679407c833545b56f4c85a4ae8b5c9ed4)",
"metadata_length": 525
}

Binary file not shown.

@ -2,14 +2,7 @@ import { describe, expect, it } from 'vitest'
import { parquetMetadata, parquetSchema } from '../src/hyparquet.js'
import { readFileToArrayBuffer } from './helpers.js'
describe('schemaTree', () => {
it('parse schema tree from addrtype-missing-value.parquet', async () => {
const arrayBuffer = await readFileToArrayBuffer('test/files/addrtype-missing-value.parquet')
const metadata = parquetMetadata(arrayBuffer)
const result = parquetSchema(metadata)
expect(result).toEqual(addrtypeSchema)
})
describe('parquetSchema', () => {
it('parse schema tree from rowgroups.parquet', async () => {
const arrayBuffer = await readFileToArrayBuffer('test/files/rowgroups.parquet')
const metadata = parquetMetadata(arrayBuffer)
@ -18,30 +11,6 @@ describe('schemaTree', () => {
})
})
// Parquet v1 from DuckDB
const addrtypeSchema = {
children: [
{
children: [],
count: 1,
element: {
converted_type: 'UTF8',
name: 'ADDRTYPE',
repetition_type: 'OPTIONAL',
type: 'BYTE_ARRAY',
},
path: ['ADDRTYPE'],
},
],
count: 2,
element: {
name: 'duckdb_schema',
num_children: 1,
repetition_type: 'REQUIRED',
},
path: [],
}
// Parquet v2 from pandas with 2 row groups
const rowgroupsSchema = {
children: [