From f37b2aea9fdccd0b2bdad4c73b3075d0801c7234 Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Mon, 17 Mar 2025 10:07:08 -0700 Subject: [PATCH] for is faster than forEach --- README.md | 6 ++++-- benchmark.js | 4 ++-- package.json | 10 +++++----- src/read.js | 33 ++++++++++++++------------------- src/thrift.js | 4 ++-- 5 files changed, 27 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index e4bf394..fcee8d3 100644 --- a/README.md +++ b/README.md @@ -165,8 +165,8 @@ const file = await asyncBufferFromUrl({ url, requestInit }) ### Returned row format -By default, data returned in the `onComplete` function will be one array of columns per row. -If you would like each row to be an object with each key the name of the column, set the option `rowFormat` to `object`. +By default, data returned by `parquetRead` in the `onComplete` function will be one **array** of columns per row. +If you would like each row to be an **object** with each key the name of the column, set the option `rowFormat` to `object`. ```javascript import { parquetRead } from 'hyparquet' @@ -178,6 +178,8 @@ await parquetRead({ }) ``` +The `parquetReadObjects` function defaults to returning an array of objects. + ## Supported Parquet Files The parquet format is known to be a sprawling format which includes options for a wide array of compression schemes, encoding types, and data structures. diff --git a/benchmark.js b/benchmark.js index ca1a22c..0a06956 100644 --- a/benchmark.js +++ b/benchmark.js @@ -1,7 +1,7 @@ import { createWriteStream, promises as fs } from 'fs' import { compressors } from 'hyparquet-compressors' import { pipeline } from 'stream/promises' -import { parquetRead } from './src/hyparquet.js' +import { parquetReadObjects } from './src/hyparquet.js' import { asyncBufferFromFile } from './src/utils.js' const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet' @@ -25,7 +25,7 @@ const startTime = performance.now() console.log('parsing example.parquet data...') // read parquet file -await parquetRead({ +await parquetReadObjects({ file, compressors, }) diff --git a/package.json b/package.json index f738a8b..6cf47ab 100644 --- a/package.json +++ b/package.json @@ -19,8 +19,8 @@ }, "main": "src/hyparquet.js", "files": [ - "types", - "src" + "src", + "types" ], "type": "module", "types": "types/hyparquet.d.ts", @@ -34,12 +34,12 @@ }, "devDependencies": { "@types/node": "22.13.10", - "@vitest/coverage-v8": "3.0.8", + "@vitest/coverage-v8": "3.0.9", "eslint": "9.22.0", - "eslint-plugin-jsdoc": "50.6.6", + "eslint-plugin-jsdoc": "50.6.8", "hyparquet-compressors": "1.0.0", "typescript": "5.8.2", "typescript-eslint": "8.26.1", - "vitest": "3.0.8" + "vitest": "3.0.9" } } diff --git a/src/read.js b/src/read.js index 8d2fb39..a3bf5c0 100644 --- a/src/read.js +++ b/src/read.js @@ -71,17 +71,16 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) { // loop through metadata to find min/max bytes to read let [groupStartByte, groupEndByte] = [file.byteLength, 0] - rowGroup.columns.forEach(({ meta_data: columnMetadata }) => { - if (!columnMetadata) throw new Error('parquet column metadata is undefined') + for (const { meta_data } of rowGroup.columns) { + if (!meta_data) throw new Error('parquet column metadata is undefined') // skip columns that are not requested - if (columns && !columns.includes(columnMetadata.path_in_schema[0])) return + if (columns && !columns.includes(meta_data.path_in_schema[0])) continue - const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number) + const [columnStartByte, columnEndByte] = getColumnRange(meta_data).map(Number) groupStartByte = Math.min(groupStartByte, columnStartByte) groupEndByte = Math.max(groupEndByte, columnEndByte) - }) + } if (groupStartByte >= groupEndByte && columns?.length) { - // TODO: should throw if any column is missing throw new Error(`parquet columns not found: ${columns.join(', ')}`) } // if row group size is less than 32mb, pre-load in one read @@ -148,16 +147,12 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) { if (subcolumns?.every(name => subcolumnData.has(name))) { // For every subcolumn, flatten and assemble the column const flatData = new Map(subcolumns.map(name => [name, flatten(subcolumnData.get(name))])) - // We have all data needed to assemble a top level column assembleNested(flatData, schemaPath[1]) const flatColumn = flatData.get(columnName) - if (flatColumn) { - chunks = [flatColumn] - subcolumns.forEach(name => subcolumnData.delete(name)) - subcolumnData.set(columnName, chunks) - } else { - throw new Error(`parquet column data not assembled: ${columnName}`) - } + if (!flatColumn) throw new Error(`parquet column data not assembled: ${columnName}`) + chunks = [flatColumn] + subcolumns.forEach(name => subcolumnData.delete(name)) + subcolumnData.set(columnName, chunks) } // do not emit column data until structs are fully parsed @@ -175,8 +170,6 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) { } await Promise.all(promises) if (options.onComplete) { - // transpose columns into rows - const groupData = new Array(rowLimit) const includedColumnNames = children .map(child => child.element.name) .filter(name => !columns || columns.includes(name)) @@ -184,14 +177,16 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) { const includedColumns = columnOrder .map(name => includedColumnNames.includes(name) ? flatten(subcolumnData.get(name)) : undefined) + // transpose columns into rows + const groupData = new Array(rowLimit) for (let row = 0; row < rowLimit; row++) { if (options.rowFormat === 'object') { // return each row as an object /** @type {Record} */ const rowData = {} - columnOrder.forEach((name, index) => { - rowData[name] = includedColumns[index]?.[row] - }) + for (let i = 0; i < columnOrder.length; i++) { + rowData[columnOrder[i]] = includedColumns[i]?.[row] + } groupData[row] = rowData } else { // return each row as an array diff --git a/src/thrift.js b/src/thrift.js index 819eecc..5720c81 100644 --- a/src/thrift.js +++ b/src/thrift.js @@ -160,7 +160,7 @@ function readVarBigInt(reader) { * @param {DataReader} reader * @returns {number} value */ -function readZigZag(reader) { +export function readZigZag(reader) { const zigzag = readVarInt(reader) // convert zigzag to int return zigzag >>> 1 ^ -(zigzag & 1) @@ -176,7 +176,7 @@ function readZigZag(reader) { export function readZigZagBigInt(reader) { const zigzag = readVarBigInt(reader) // convert zigzag to int - return zigzag >> BigInt(1) ^ -(zigzag & BigInt(1)) + return zigzag >> 1n ^ -(zigzag & 1n) } /**