for is faster than forEach

Kenny Daniel 2025-03-17 10:07:08 -07:00
parent 97aef2adef
commit f37b2aea9f
5 changed files with 27 additions and 30 deletions
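The premise is that a plain `for` loop avoids the per-element callback call that `Array.prototype.forEach` incurs. A rough micro-benchmark along these lines can sanity-check the claim (illustrative only, not part of this commit; results vary by JS engine and workload):

```javascript
// Illustrative micro-benchmark: forEach vs. a plain for...of loop.
// Numbers depend heavily on the engine and the loop body.
const data = Array.from({ length: 1_000_000 }, (_, i) => i)

let t0 = performance.now()
let sumForEach = 0
data.forEach(x => { sumForEach += x })
console.log('forEach :', (performance.now() - t0).toFixed(2), 'ms')

t0 = performance.now()
let sumFor = 0
for (const x of data) sumFor += x
console.log('for...of:', (performance.now() - t0).toFixed(2), 'ms')
```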

@@ -165,8 +165,8 @@ const file = await asyncBufferFromUrl({ url, requestInit })
### Returned row format
-By default, data returned in the `onComplete` function will be one array of columns per row.
-If you would like each row to be an object with each key the name of the column, set the option `rowFormat` to `object`.
+By default, data returned by `parquetRead` in the `onComplete` function will be one **array** of columns per row.
+If you would like each row to be an **object** with each key the name of the column, set the option `rowFormat` to `object`.
```javascript
import { parquetRead } from 'hyparquet'
@@ -178,6 +178,8 @@ await parquetRead({
})
```
+The `parquetReadObjects` function defaults to returning an array of objects.
## Supported Parquet Files
The parquet format is known to be a sprawling format which includes options for a wide array of compression schemes, encoding types, and data structures.
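For context, the README section edited above distinguishes the two row formats. A minimal usage sketch, assuming the entry points shown elsewhere in this diff (`asyncBufferFromUrl`, `parquetRead`, `parquetReadObjects`) and a placeholder URL:

```javascript
import { asyncBufferFromUrl, parquetRead, parquetReadObjects } from 'hyparquet'

const file = await asyncBufferFromUrl({ url: 'https://example.com/data.parquet' })

// default rowFormat: each row is an array of values, one per column
await parquetRead({ file, onComplete: rows => console.log(rows[0]) })

// rowFormat 'object': each row is keyed by column name
await parquetRead({ file, rowFormat: 'object', onComplete: rows => console.log(rows[0]) })

// parquetReadObjects resolves to rows as objects, no callback needed
const rows = await parquetReadObjects({ file })
```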

@@ -1,7 +1,7 @@
import { createWriteStream, promises as fs } from 'fs'
import { compressors } from 'hyparquet-compressors'
import { pipeline } from 'stream/promises'
-import { parquetRead } from './src/hyparquet.js'
+import { parquetReadObjects } from './src/hyparquet.js'
import { asyncBufferFromFile } from './src/utils.js'
const url = 'https://huggingface.co/datasets/wikimedia/wikipedia/resolve/main/20231101.en/train-00000-of-00041.parquet'
@@ -25,7 +25,7 @@ const startTime = performance.now()
console.log('parsing example.parquet data...')
// read parquet file
-await parquetRead({
+await parquetReadObjects({
file,
compressors,
})
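The demo script now awaits `parquetReadObjects` directly instead of the callback-style `parquetRead`, passing the optional codecs from `hyparquet-compressors`. Outside the repo the same flow looks roughly like this (a sketch; the file path is a placeholder and it assumes the package re-exports the `asyncBufferFromFile` helper used above):

```javascript
import { asyncBufferFromFile, parquetReadObjects } from 'hyparquet'
import { compressors } from 'hyparquet-compressors'

// open a local parquet file as an AsyncBuffer and read every row;
// passing compressors enables codecs beyond the built-in ones
const file = await asyncBufferFromFile('example.parquet')
const rows = await parquetReadObjects({ file, compressors })
console.log(rows.length, 'rows')
```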

@@ -19,8 +19,8 @@
},
"main": "src/hyparquet.js",
"files": [
"types",
"src"
"src",
"types"
],
"type": "module",
"types": "types/hyparquet.d.ts",
@@ -34,12 +34,12 @@
},
"devDependencies": {
"@types/node": "22.13.10",
"@vitest/coverage-v8": "3.0.8",
"@vitest/coverage-v8": "3.0.9",
"eslint": "9.22.0",
"eslint-plugin-jsdoc": "50.6.6",
"eslint-plugin-jsdoc": "50.6.8",
"hyparquet-compressors": "1.0.0",
"typescript": "5.8.2",
"typescript-eslint": "8.26.1",
"vitest": "3.0.8"
"vitest": "3.0.9"
}
}

@@ -71,17 +71,16 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
// loop through metadata to find min/max bytes to read
let [groupStartByte, groupEndByte] = [file.byteLength, 0]
-rowGroup.columns.forEach(({ meta_data: columnMetadata }) => {
-if (!columnMetadata) throw new Error('parquet column metadata is undefined')
+for (const { meta_data } of rowGroup.columns) {
+if (!meta_data) throw new Error('parquet column metadata is undefined')
// skip columns that are not requested
-if (columns && !columns.includes(columnMetadata.path_in_schema[0])) return
+if (columns && !columns.includes(meta_data.path_in_schema[0])) continue
-const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
+const [columnStartByte, columnEndByte] = getColumnRange(meta_data).map(Number)
groupStartByte = Math.min(groupStartByte, columnStartByte)
groupEndByte = Math.max(groupEndByte, columnEndByte)
-})
+}
if (groupStartByte >= groupEndByte && columns?.length) {
// TODO: should throw if any column is missing
throw new Error(`parquet columns not found: ${columns.join(', ')}`)
}
// if row group size is less than 32mb, pre-load in one read
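The refactor above replaces a `forEach` callback with a `for...of` loop, so the early skip changes from `return` (which only ends that callback invocation) to `continue`. A tiny standalone sketch of the equivalence:

```javascript
const items = [1, null, 2]

// forEach: 'return' skips just this callback call
items.forEach(item => {
  if (!item) return
  console.log('forEach saw', item)
})

// for...of: 'continue' has the same effect, and the loop can also break or await
for (const item of items) {
  if (!item) continue
  console.log('for...of saw', item)
}
```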
@@ -148,16 +147,12 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
if (subcolumns?.every(name => subcolumnData.has(name))) {
// For every subcolumn, flatten and assemble the column
const flatData = new Map(subcolumns.map(name => [name, flatten(subcolumnData.get(name))]))
-// We have all data needed to assemble a top level column
assembleNested(flatData, schemaPath[1])
const flatColumn = flatData.get(columnName)
-if (flatColumn) {
-chunks = [flatColumn]
-subcolumns.forEach(name => subcolumnData.delete(name))
-subcolumnData.set(columnName, chunks)
-} else {
-throw new Error(`parquet column data not assembled: ${columnName}`)
-}
+if (!flatColumn) throw new Error(`parquet column data not assembled: ${columnName}`)
+chunks = [flatColumn]
+subcolumns.forEach(name => subcolumnData.delete(name))
+subcolumnData.set(columnName, chunks)
}
// do not emit column data until structs are fully parsed
@@ -175,8 +170,6 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
}
await Promise.all(promises)
if (options.onComplete) {
-// transpose columns into rows
-const groupData = new Array(rowLimit)
const includedColumnNames = children
.map(child => child.element.name)
.filter(name => !columns || columns.includes(name))
@@ -184,14 +177,16 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
const includedColumns = columnOrder
.map(name => includedColumnNames.includes(name) ? flatten(subcolumnData.get(name)) : undefined)
+// transpose columns into rows
+const groupData = new Array(rowLimit)
for (let row = 0; row < rowLimit; row++) {
if (options.rowFormat === 'object') {
// return each row as an object
/** @type {Record<string, any>} */
const rowData = {}
-columnOrder.forEach((name, index) => {
-rowData[name] = includedColumns[index]?.[row]
-})
+for (let i = 0; i < columnOrder.length; i++) {
+rowData[columnOrder[i]] = includedColumns[i]?.[row]
+}
groupData[row] = rowData
} else {
// return each row as an array

@@ -160,7 +160,7 @@ function readVarBigInt(reader) {
* @param {DataReader} reader
* @returns {number} value
*/
-function readZigZag(reader) {
+export function readZigZag(reader) {
const zigzag = readVarInt(reader)
// convert zigzag to int
return zigzag >>> 1 ^ -(zigzag & 1)
@@ -176,7 +176,7 @@ function readZigZag(reader) {
export function readZigZagBigInt(reader) {
const zigzag = readVarBigInt(reader)
// convert zigzag to int
-return zigzag >> BigInt(1) ^ -(zigzag & BigInt(1))
+return zigzag >> 1n ^ -(zigzag & 1n)
}
/**
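The last hunks export `readZigZag` and switch the BigInt variant to `1n` literals. Zigzag encoding maps signed integers to unsigned ones so that small magnitudes stay small as varints (0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...). A standalone sketch of the 32-bit round trip, using the same decode expression as the function above:

```javascript
// encode interleaves positive and negative values; decode undoes it
const zigzagEncode = n => n << 1 ^ n >> 31
const zigzagDecode = z => z >>> 1 ^ -(z & 1)

for (const n of [0, -1, 1, -2, 2, 1000, -1000]) {
  const z = zigzagEncode(n)
  console.log(n, '->', z, '->', zigzagDecode(z)) // round-trips back to n
}
```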