Add an option to return each row as an object keyed by column name (#25)

* Add an option to return each row as an object keyed by column name

* rename option to rowFormat and address feedback
ctranstrum 2024-08-13 09:15:59 -07:00 committed by GitHub
parent 400c73607c
commit d13d52b606
5 changed files with 65 additions and 8 deletions

@@ -117,6 +117,21 @@ await parquetRead({
 })
 ```
+
+## Column names
+
+By default, the data passed to the `onComplete` callback is an array of rows, with each row an array of values in column order.
+If you would like each row to be an object keyed by column name, set the option `rowFormat` to `object`.
+
+```js
+import { parquetRead } from 'hyparquet'
+
+await parquetRead({
+  file,
+  rowFormat: 'object',
+  onComplete: data => console.log(data),
+})
+```
 ## Advanced Usage
 ### AsyncBuffer
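For contrast with the README snippet above, a sketch of the two row shapes (the column names here are hypothetical, not from the library):

```js
// Illustrative only, assuming hypothetical columns `id` and `name`:
// default (arrays):    [[1, 'alice'], [2, 'bob']]
// rowFormat: 'object': [{ id: 1, name: 'alice' }, { id: 2, name: 'bob' }]
```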

@@ -112,7 +112,7 @@ async function render(asyncBuffer, metadata, name) {
     compressors,
     file: asyncBuffer,
     rowEnd: 1000,
-    onComplete(/** @type {any[][]} */ data) {
+    onComplete(/** @type {any[][] | Record<string, any>[]} */ data) {
       const ms = performance.now() - startTime
       console.log(`parsed ${name} in ${ms.toFixed(0)} ms`)
       content.appendChild(renderTable(header, data))
@@ -144,7 +144,7 @@ fileInput?.addEventListener('change', () => {
 /**
  * @param {string[]} header
- * @param {any[][]} data
+ * @param {any[][] | Record<string, any>[]} data
  * @returns {HTMLTableElement}
  */
 function renderTable(header, data) {
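Since `renderTable` can now receive either row shape, a caller might normalize rows before rendering. A minimal sketch, not the demo's actual implementation (`rowValues` is a hypothetical helper):

```js
/**
 * Return cell values in header order for either row shape.
 * @param {string[]} header column names, in display order
 * @param {any[] | Record<string, any>} row
 * @returns {any[]}
 */
function rowValues(header, row) {
  // arrays are already in column order; objects are keyed by column name
  return Array.isArray(row) ? row : header.map(name => row[name])
}
```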

src/hyparquet.d.ts

@@ -17,6 +17,7 @@ export type { AsyncBuffer, Compressors, FileMetaData, SchemaTree }
  * @param {AsyncBuffer} options.file file-like object containing parquet data
  * @param {FileMetaData} [options.metadata] parquet file metadata
  * @param {string[]} [options.columns] columns to read, all columns if undefined
+ * @param {string} [options.rowFormat] desired format of each row passed to the onComplete function
  * @param {number} [options.rowStart] first requested row index (inclusive)
  * @param {number} [options.rowEnd] last requested row index (exclusive)
  * @param {Function} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.
@@ -111,10 +112,11 @@ export interface ParquetReadOptions {
   file: AsyncBuffer // file-like object containing parquet data
   metadata?: FileMetaData // parquet metadata, will be parsed if not provided
   columns?: string[] // columns to read, all columns if undefined
+  rowFormat?: string // format of each row passed to the onComplete function
   rowStart?: number // inclusive
   rowEnd?: number // exclusive
   onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may be outside the requested range.
-  onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed
+  onComplete?: (rows: any[][] | Record<string, any>[]) => void // called when all requested rows and columns are parsed
   compressors?: Compressors // custom decompressors
   utf8?: boolean // decode byte arrays as utf8 strings (default true)
 }
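Because the declared `onComplete` parameter is now the union `any[][] | Record<string, any>[]`, a caller who sets `rowFormat: 'object'` may want to narrow the type explicitly. A minimal sketch (the file path is a placeholder; the cast reflects the caller's knowledge, not anything enforced by the types):

```js
import { asyncBufferFromFile, parquetRead } from 'hyparquet'

// 'example.parquet' is a placeholder path
const file = await asyncBufferFromFile('example.parquet')
await parquetRead({
  file,
  rowFormat: 'object',
  onComplete: rows => {
    // narrow the union: with rowFormat 'object', rows are objects keyed by column name
    const objects = /** @type {Record<string, any>[]} */ (rows)
    for (const row of objects) console.log(Object.keys(row))
  },
})
```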

@@ -1,4 +1,3 @@
 import { assembleNested } from './assemble.js'
 import { getColumnRange, readColumn } from './column.js'
 import { parquetMetadataAsync } from './metadata.js'
@@ -24,10 +23,11 @@ import { concat } from './utils.js'
  * @param {AsyncBuffer} options.file file-like object containing parquet data
  * @param {FileMetaData} [options.metadata] parquet file metadata
  * @param {string[]} [options.columns] columns to read, all columns if undefined
+ * @param {string} [options.rowFormat] format of each row passed to the onComplete function
  * @param {number} [options.rowStart] first requested row index (inclusive)
  * @param {number} [options.rowEnd] last requested row index (exclusive)
  * @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.
- * @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed
+ * @param {(rows: any[][] | Record<string, any>[]) => void} [options.onComplete] called when all requested rows and columns are parsed
  * @param {Compressors} [options.compressors] custom decompressors
  * @returns {Promise<void>} resolves when all requested rows and columns are parsed
  */
@@ -74,8 +74,9 @@ export async function parquetRead(options) {
  * @param {AsyncBuffer} options.file file-like object containing parquet data
  * @param {FileMetaData} [options.metadata] parquet file metadata
  * @param {string[]} [options.columns] columns to read, all columns if undefined
+ * @param {string} [options.rowFormat] format of each row passed to the onComplete function
  * @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.
- * @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed
+ * @param {(rows: any[][] | Record<string, any>[]) => void} [options.onComplete] called when all requested rows and columns are parsed
  * @param {Compressors} [options.compressors]
  * @param {RowGroup} rowGroup row group to read
  * @param {number} groupStart row index of the first row in the group
@@ -186,12 +187,25 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
   if (options.onComplete) {
     // transpose columns into rows
     const groupData = new Array(rowLimit)
-    const includedColumns = children
+    const includedColumnNames = children
       .map(child => child.element.name)
       .filter(name => !columns || columns.includes(name))
+    const includedColumns = includedColumnNames
       .map(name => subcolumnData.get(name))
     for (let row = 0; row < rowLimit; row++) {
-      groupData[row] = includedColumns.map(column => column[row])
+      if (options.rowFormat === 'object') {
+        // return each row as an object
+        /** @type {Record<string, any>} */
+        const rowData = {}
+        includedColumnNames.forEach((name, index) => {
+          rowData[name] = includedColumns[index][row]
+        })
+        groupData[row] = rowData
+      } else {
+        // return each row as an array
+        groupData[row] = includedColumns.map(column => column[row])
+      }
     }
     return groupData
   }
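The heart of the change above is the transpose step. A standalone sketch of the same logic, separate from the library's internals (the name and signature are illustrative, assuming columns are already decoded):

```js
/**
 * Turn decoded columns into rows, as arrays (default) or objects keyed by column name.
 * @param {string[]} names column names, in column order
 * @param {any[][]} columns one array of decoded values per column
 * @param {string} [rowFormat] 'object' for keyed rows
 * @returns {any[][] | Record<string, any>[]}
 */
function transpose(names, columns, rowFormat) {
  const rowCount = columns[0]?.length ?? 0
  const rows = new Array(rowCount)
  for (let row = 0; row < rowCount; row++) {
    if (rowFormat === 'object') {
      /** @type {Record<string, any>} */
      const rowData = {}
      names.forEach((name, index) => { rowData[name] = columns[index][row] })
      rows[row] = rowData
    } else {
      rows[row] = columns.map(column => column[row])
    }
  }
  return rows
}

// transpose(['a', 'b'], [[1, 2], ['x', 'y']], 'object')
// => [{ a: 1, b: 'x' }, { a: 2, b: 'y' }]
```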

@@ -123,4 +123,30 @@ describe('parquetRead', () => {
       },
     })
   })
+
+  it('format row as object', async () => {
+    const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
+    await parquetRead({
+      file,
+      columns: ['c'],
+      rowFormat: 'object',
+      onChunk: chunk => {
+        expect(toJson(chunk)).toEqual({
+          columnName: 'c',
+          columnData: [2, 3, 4, 5, 2],
+          rowStart: 0,
+          rowEnd: 5,
+        })
+      },
+      onComplete: (rows) => {
+        expect(toJson(rows)).toEqual([
+          { c: 2 },
+          { c: 3 },
+          { c: 4 },
+          { c: 5 },
+          { c: 2 },
+        ])
+      },
+    })
+  })
 })