Adjust read coalesce size

2026-02-20 11:31:31 +00:00 · 2024-04-29 14:22:07 -07:00 · 2024-04-29 14:22:07 -07:00 · a42cc558d0
commit a42cc558d0
parent d6a1981bcc
3 changed files with 7 additions and 9 deletions
--- a/package.json
+++ b/package.json
@@ -28,7 +28,7 @@
  },
  "devDependencies": {
    "@types/node": "20.12.7",
-    "@typescript-eslint/eslint-plugin": "7.7.1",
+    "@typescript-eslint/eslint-plugin": "7.8.0",
    "@vitest/coverage-v8": "1.5.2",
    "eslint": "8.57.0",
    "eslint-plugin-import": "2.29.1",
--- a/src/column.js
+++ b/src/column.js
@@ -66,11 +66,13 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata,
      if (repetitionLevels.length) {
        dereferenceDictionary(dictionary, dataPage)
        // Use repetition levels to construct lists
-        const isNull = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]])
+        const isNullable = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]])
        const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)
        const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema)
+        // convert primitive types to rich types
+        values = convert(dataPage, schemaElement)
        values = assembleObjects(
-          definitionLevels, repetitionLevels, dataPage, isNull, maxDefinitionLevel, maxRepetitionLevel
+          definitionLevels, repetitionLevels, values, isNullable, maxDefinitionLevel, maxRepetitionLevel
        )
      } else if (definitionLevels?.length) {
        const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema)
--- a/src/read.js
+++ b/src/read.js
@@ -101,9 +101,9 @@ async function readRowGroup(options, rowGroup, groupStart) {
    // TODO: should throw if any column is missing
    throw new Error(`parquet columns not found: ${columns.join(', ')}`)
  }
-  // if row group size is less than 128mb, pre-load in one read
+  // if row group size is less than 32mb, pre-load in one read
  let groupBuffer
-  if (groupEndByte - groupStartByte <= 1 << 27) {
+  if (groupEndByte - groupStartByte <= 1 << 25) {
    // pre-load row group byte data in one big read,
    // otherwise read column data individually
    groupBuffer = await file.slice(groupStartByte, groupEndByte)
@@ -186,10 +186,6 @@ async function readRowGroup(options, rowGroup, groupStart) {
                    keys[i][j] = keys[i][j][0]
                    values[i][j] = values[i][j][0]
                  }
-                  if (keys[i][j] instanceof Uint8Array) {
-                    // decode utf-8 keys
-                    keys[i][j] = new TextDecoder().decode(keys[i][j])
-                  }
                  if (!keys[i][j]) continue
                  obj[keys[i][j]] = values[i][j] === undefined ? null : values[i][j]
                }