diff --git a/README.md b/README.md index 912a0e8..86a1d89 100644 --- a/README.md +++ b/README.md @@ -103,14 +103,15 @@ To parse parquet files from a user drag-and-drop action, see example in [index.h To read large parquet files, it is recommended that you filter by row and column. Hyparquet is designed to load only the minimal amount of data needed to fulfill a query. -You can filter rows by number, or columns by name: +You can filter rows by number, or columns by name, +and columns will be returned in the same order they were requested: ```js import { parquetRead } from 'hyparquet' await parquetRead({ file, - columns: ['colA', 'colB'], // include columns colA and colB + columns: ['colB', 'colA'], // include columns colB and colA rowStart: 100, rowEnd: 200, onComplete: data => console.log(data), diff --git a/src/read.js b/src/read.js index 544e49b..166b4d0 100644 --- a/src/read.js +++ b/src/read.js @@ -190,21 +190,22 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) { const includedColumnNames = children .map(child => child.element.name) .filter(name => !columns || columns.includes(name)) - const includedColumns = includedColumnNames - .map(name => subcolumnData.get(name)) + const columnOrder = columns || includedColumnNames + const includedColumns = columnOrder + .map(name => includedColumnNames.includes(name) ? subcolumnData.get(name) : undefined) for (let row = 0; row < rowLimit; row++) { if (options.rowFormat === 'object') { // return each row as an object /** @type {Record} */ const rowData = {} - includedColumnNames.forEach((name, index) => { - rowData[name] = includedColumns[index][row] + columnOrder.forEach((name, index) => { + rowData[name] = includedColumns[index]?.[row] }) groupData[row] = rowData } else { // return each row as an array - groupData[row] = includedColumns.map(column => column[row]) + groupData[row] = includedColumns.map(column => column?.[row]) } } return groupData diff --git a/test/read.test.js b/test/read.test.js index cd1177d..94c4086 100644 --- a/test/read.test.js +++ b/test/read.test.js @@ -149,4 +149,38 @@ describe('parquetRead', () => { }, }) }) + + it('read columns out of order', async () => { + const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet') + await parquetRead({ + file, + columns: ['c', 'missing', 'b', 'c'], + onChunk: chunk => { + if (chunk.columnName === 'b') { + expect(toJson(chunk)).toEqual({ + columnName: 'b', + columnData: [1, 2, 3, 4, 5], + rowStart: 0, + rowEnd: 5, + }) + } else { + expect(toJson(chunk)).toEqual({ + columnName: 'c', + columnData: [2, 3, 4, 5, 2], + rowStart: 0, + rowEnd: 5, + }) + } + }, + onComplete: (rows) => { + expect(toJson(rows)).toEqual([ + [2, null, 1, 2], + [3, null, 2, 3], + [4, null, 3, 4], + [5, null, 4, 5], + [2, null, 5, 2], + ]) + }, + }) + }) })