From a42cc558d00f78a06b9f1b25d1f47e4e8a373dfe Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Mon, 29 Apr 2024 14:22:07 -0700 Subject: [PATCH] Adjust read coalesce size --- package.json | 2 +- src/column.js | 6 ++++-- src/read.js | 8 ++------ 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/package.json b/package.json index 987de90..01b00fc 100644 --- a/package.json +++ b/package.json @@ -28,7 +28,7 @@ }, "devDependencies": { "@types/node": "20.12.7", - "@typescript-eslint/eslint-plugin": "7.7.1", + "@typescript-eslint/eslint-plugin": "7.8.0", "@vitest/coverage-v8": "1.5.2", "eslint": "8.57.0", "eslint-plugin-import": "2.29.1", diff --git a/src/column.js b/src/column.js index 4dcf98d..2747736 100644 --- a/src/column.js +++ b/src/column.js @@ -66,11 +66,13 @@ export function readColumn(arrayBuffer, columnOffset, rowGroup, columnMetadata, if (repetitionLevels.length) { dereferenceDictionary(dictionary, dataPage) // Use repetition levels to construct lists - const isNull = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]]) + const isNullable = columnMetadata && !isRequired(schema, [columnMetadata.path_in_schema[0]]) const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) const maxRepetitionLevel = getMaxRepetitionLevel(schema, columnMetadata.path_in_schema) + // convert primitive types to rich types + values = convert(dataPage, schemaElement) values = assembleObjects( - definitionLevels, repetitionLevels, dataPage, isNull, maxDefinitionLevel, maxRepetitionLevel + definitionLevels, repetitionLevels, values, isNullable, maxDefinitionLevel, maxRepetitionLevel ) } else if (definitionLevels?.length) { const maxDefinitionLevel = getMaxDefinitionLevel(schema, columnMetadata.path_in_schema) diff --git a/src/read.js b/src/read.js index debde57..b839158 100644 --- a/src/read.js +++ b/src/read.js @@ -101,9 +101,9 @@ async function readRowGroup(options, rowGroup, groupStart) { // TODO: should throw if any column is missing throw new Error(`parquet columns not found: ${columns.join(', ')}`) } - // if row group size is less than 128mb, pre-load in one read + // if row group size is less than 32mb, pre-load in one read let groupBuffer - if (groupEndByte - groupStartByte <= 1 << 27) { + if (groupEndByte - groupStartByte <= 1 << 25) { // pre-load row group byte data in one big read, // otherwise read column data individually groupBuffer = await file.slice(groupStartByte, groupEndByte) @@ -186,10 +186,6 @@ async function readRowGroup(options, rowGroup, groupStart) { keys[i][j] = keys[i][j][0] values[i][j] = values[i][j][0] } - if (keys[i][j] instanceof Uint8Array) { - // decode utf-8 keys - keys[i][j] = new TextDecoder().decode(keys[i][j]) - } if (!keys[i][j]) continue obj[keys[i][j]] = values[i][j] === undefined ? null : values[i][j] }