return column names in the order requested (#27)

* return column names in the order requested
* retain correct ordering of columns in object rows as well
2026-02-23 04:41:33 +00:00 · 2024-08-14 00:01:47 -07:00 · 2024-08-14 00:01:47 -07:00 · 8ace1a47d2
commit 8ace1a47d2
parent d13d52b606
3 changed files with 43 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -103,14 +103,15 @@ To parse parquet files from a user drag-and-drop action, see example in [index.h

 To read large parquet files, it is recommended that you filter by row and column.
 Hyparquet is designed to load only the minimal amount of data needed to fulfill a query.
-You can filter rows by number, or columns by name:
+You can filter rows by number, or columns by name,
+and columns will be returned in the same order they were requested:

 ```js
 import { parquetRead } from 'hyparquet'

 await parquetRead({
  file,
-  columns: ['colA', 'colB'], // include columns colA and colB
+  columns: ['colB', 'colA'], // include columns colB and colA
  rowStart: 100,
  rowEnd: 200,
  onComplete: data => console.log(data),
--- a/src/read.js
+++ b/src/read.js
@@ -190,21 +190,22 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
    const includedColumnNames = children
      .map(child => child.element.name)
      .filter(name => !columns || columns.includes(name))
-    const includedColumns = includedColumnNames
-      .map(name => subcolumnData.get(name))
+    const columnOrder = columns || includedColumnNames
+    const includedColumns = columnOrder
+      .map(name => includedColumnNames.includes(name) ? subcolumnData.get(name) : undefined)

    for (let row = 0; row < rowLimit; row++) {
      if (options.rowFormat === 'object') {
        // return each row as an object
        /** @type {Record<string, any>} */
        const rowData = {}
-        includedColumnNames.forEach((name, index) => {
-          rowData[name] = includedColumns[index][row]
+        columnOrder.forEach((name, index) => {
+          rowData[name] = includedColumns[index]?.[row]
        })
        groupData[row] = rowData
      } else {
        // return each row as an array
-        groupData[row] = includedColumns.map(column => column[row])
+        groupData[row] = includedColumns.map(column => column?.[row])
      }
    }
    return groupData
--- a/test/read.test.js
+++ b/test/read.test.js
@@ -149,4 +149,38 @@ describe('parquetRead', () => {
      },
    })
  })
+
+  it('read columns out of order', async () => {
+    const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
+    await parquetRead({
+      file,
+      columns: ['c', 'missing', 'b', 'c'],
+      onChunk: chunk => {
+        if (chunk.columnName === 'b') {
+          expect(toJson(chunk)).toEqual({
+            columnName: 'b',
+            columnData: [1, 2, 3, 4, 5],
+            rowStart: 0,
+            rowEnd: 5,
+          })
+        } else {
+          expect(toJson(chunk)).toEqual({
+            columnName: 'c',
+            columnData: [2, 3, 4, 5, 2],
+            rowStart: 0,
+            rowEnd: 5,
+          })
+        }
+      },
+      onComplete: (rows) => {
+        expect(toJson(rows)).toEqual([
+          [2, null, 1, 2],
+          [3, null, 2, 3],
+          [4, null, 3, 4],
+          [5, null, 4, 5],
+          [2, null, 5, 2],
+        ])
+      },
+    })
+  })
 })