Convert logical strings

This commit is contained in:
Kenny Daniel 2025-05-15 23:44:09 -07:00
parent b635904239
commit 8dbb74ac78
No known key found for this signature in database
GPG Key ID: FDF16101AF5AFD3A
5 changed files with 154 additions and 4 deletions

@ -7,10 +7,10 @@ const dayMillis = 86400000 // 1 day in milliseconds
* @param {DecodedArray | undefined} dictionary
* @param {SchemaElement} schemaElement
* @param {Encoding} encoding
* @param {boolean | undefined} utf8 decode bytes as utf8?
* @param {boolean} [utf8] decode bytes as utf8?
* @returns {DecodedArray} series of rich types
*/
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8 = true) {
export function convertWithDictionary(data, dictionary, schemaElement, encoding, utf8) {
if (dictionary && encoding.endsWith('_DICTIONARY')) {
let output = data
if (data instanceof Uint8Array && !(dictionary instanceof Uint8Array)) {
@ -31,7 +31,7 @@ export function convertWithDictionary(data, dictionary, schemaElement, encoding,
*
* @param {DecodedArray} data series of primitive types
* @param {SchemaElement} schemaElement
* @param {boolean | undefined} utf8 decode bytes as utf8?
* @param {boolean} [utf8] decode bytes as utf8?
* @returns {DecodedArray} series of rich types
*/
export function convert(data, schemaElement, utf8 = true) {
@ -83,7 +83,7 @@ export function convert(data, schemaElement, utf8 = true) {
if (ctype === 'INTERVAL') {
throw new Error('parquet interval not supported')
}
if (ctype === 'UTF8' || utf8 && type === 'BYTE_ARRAY') {
if (ctype === 'UTF8' || ltype?.type === 'STRING' || utf8 && type === 'BYTE_ARRAY') {
const decoder = new TextDecoder()
const arr = new Array(data.length)
for (let i = 0; i < arr.length; i++) {

6
test/files/strings.json Normal file

@ -0,0 +1,6 @@
[
["alpha", "alpha", "alpha"],
["bravo", "bravo", "bravo"],
["charlie", "charlie", "charlie"],
["delta", "delta", "delta"]
]

@ -0,0 +1,90 @@
{
"version": 2,
"schema": [
{
"name": "root",
"num_children": 3
},
{
"type": "BYTE_ARRAY",
"name": "bytes"
},
{
"type": "BYTE_ARRAY",
"name": "c_utf8",
"converted_type": "UTF8"
},
{
"type": "BYTE_ARRAY",
"name": "l_utf8",
"logical_type": {
"type": "STRING"
}
}
],
"num_rows": 4,
"row_groups": [
{
"columns": [
{
"file_offset": 4,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["PLAIN"],
"path_in_schema": ["bytes"],
"codec": "UNCOMPRESSED",
"num_values": 4,
"total_uncompressed_size": 62,
"total_compressed_size": 62,
"data_page_offset": 4,
"statistics": {
"null_count": 0,
"max_value": "delta",
"min_value": "alpha"
}
}
},
{
"file_offset": 66,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["PLAIN"],
"path_in_schema": ["c_utf8"],
"codec": "UNCOMPRESSED",
"num_values": 4,
"total_uncompressed_size": 62,
"total_compressed_size": 62,
"data_page_offset": 66,
"statistics": {
"null_count": 0,
"max_value": "delta",
"min_value": "alpha"
}
}
},
{
"file_offset": 128,
"meta_data": {
"type": "BYTE_ARRAY",
"encodings": ["PLAIN"],
"path_in_schema": ["l_utf8"],
"codec": "UNCOMPRESSED",
"num_values": 4,
"total_uncompressed_size": 62,
"total_compressed_size": 62,
"data_page_offset": 128,
"statistics": {
"null_count": 0,
"max_value": "delta",
"min_value": "alpha"
}
}
}
],
"total_byte_size": 186,
"num_rows": 4
}
],
"created_by": "hyparquet",
"metadata_length": 219
}

BIN
test/files/strings.parquet Normal file

Binary file not shown.

54
test/read.utf8.test.js Normal file

@ -0,0 +1,54 @@
import { describe, expect, it } from 'vitest'
import { parquetReadObjects } from '../src/hyparquet.js'
import { asyncBufferFromFile } from '../src/utils.js'

describe('parquetRead utf8', () => {
  // Values stored in each of the three BYTE_ARRAY columns of strings.parquet
  const words = ['alpha', 'bravo', 'charlie', 'delta']
  // Expected rows when every BYTE_ARRAY column is decoded as utf8
  const decodedRows = words.map(word => ({ bytes: word, c_utf8: word, l_utf8: word }))

  // Read the shared fixture, forwarding any extra parquet read options
  async function readStrings(options = {}) {
    const file = await asyncBufferFromFile('test/files/strings.parquet')
    return parquetReadObjects({ file, ...options })
  }

  it('default utf8 behavior', async () => {
    // with no option given, plain BYTE_ARRAY columns decode as utf8
    expect(await readStrings()).toEqual(decodedRows)
  })

  it('utf8 = true', async () => {
    // explicit utf8: true matches the default behavior
    expect(await readStrings({ utf8: true })).toEqual(decodedRows)
  })

  it('utf8 = false', async () => {
    // plain BYTE_ARRAY stays raw bytes, but columns marked with the
    // UTF8 converted type or STRING logical type still decode as strings
    const encoder = new TextEncoder()
    const rawRows = words.map(word => ({
      bytes: encoder.encode(word),
      c_utf8: word,
      l_utf8: word,
    }))
    expect(await readStrings({ utf8: false })).toEqual(rawRows)
  })
})