mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-26 23:26:38 +00:00
return column names in the order requested (#27)
* return column names in the order requested
* retain correct ordering of columns in object rows as well
This commit is contained in:
parent
d13d52b606
commit
8ace1a47d2
@@ -103,14 +103,15 @@ To parse parquet files from a user drag-and-drop action, see example in [index.h
|
||||
|
||||
To read large parquet files, it is recommended that you filter by row and column.
|
||||
Hyparquet is designed to load only the minimal amount of data needed to fulfill a query.
|
||||
You can filter rows by number, or columns by name:
|
||||
You can filter rows by number, or columns by name,
|
||||
and columns will be returned in the same order they were requested:
|
||||
|
||||
```js
|
||||
import { parquetRead } from 'hyparquet'
|
||||
|
||||
await parquetRead({
|
||||
file,
|
||||
columns: ['colA', 'colB'], // include columns colA and colB
|
||||
columns: ['colB', 'colA'], // include columns colB and colA
|
||||
rowStart: 100,
|
||||
rowEnd: 200,
|
||||
onComplete: data => console.log(data),
|
||||
|
||||
11
src/read.js
11
src/read.js
@@ -190,21 +190,22 @@ export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
|
||||
const includedColumnNames = children
|
||||
.map(child => child.element.name)
|
||||
.filter(name => !columns || columns.includes(name))
|
||||
const includedColumns = includedColumnNames
|
||||
.map(name => subcolumnData.get(name))
|
||||
const columnOrder = columns || includedColumnNames
|
||||
const includedColumns = columnOrder
|
||||
.map(name => includedColumnNames.includes(name) ? subcolumnData.get(name) : undefined)
|
||||
|
||||
for (let row = 0; row < rowLimit; row++) {
|
||||
if (options.rowFormat === 'object') {
|
||||
// return each row as an object
|
||||
/** @type {Record<string, any>} */
|
||||
const rowData = {}
|
||||
includedColumnNames.forEach((name, index) => {
|
||||
rowData[name] = includedColumns[index][row]
|
||||
columnOrder.forEach((name, index) => {
|
||||
rowData[name] = includedColumns[index]?.[row]
|
||||
})
|
||||
groupData[row] = rowData
|
||||
} else {
|
||||
// return each row as an array
|
||||
groupData[row] = includedColumns.map(column => column[row])
|
||||
groupData[row] = includedColumns.map(column => column?.[row])
|
||||
}
|
||||
}
|
||||
return groupData
|
||||
|
||||
@@ -149,4 +149,38 @@ describe('parquetRead', () => {
|
||||
},
|
||||
})
|
||||
})
|
||||
|
||||
it('read columns out of order', async () => {
|
||||
const file = await asyncBufferFromFile('test/files/datapage_v2.snappy.parquet')
|
||||
await parquetRead({
|
||||
file,
|
||||
columns: ['c', 'missing', 'b', 'c'],
|
||||
onChunk: chunk => {
|
||||
if (chunk.columnName === 'b') {
|
||||
expect(toJson(chunk)).toEqual({
|
||||
columnName: 'b',
|
||||
columnData: [1, 2, 3, 4, 5],
|
||||
rowStart: 0,
|
||||
rowEnd: 5,
|
||||
})
|
||||
} else {
|
||||
expect(toJson(chunk)).toEqual({
|
||||
columnName: 'c',
|
||||
columnData: [2, 3, 4, 5, 2],
|
||||
rowStart: 0,
|
||||
rowEnd: 5,
|
||||
})
|
||||
}
|
||||
},
|
||||
onComplete: (rows) => {
|
||||
expect(toJson(rows)).toEqual([
|
||||
[2, null, 1, 2],
|
||||
[3, null, 2, 3],
|
||||
[4, null, 3, 4],
|
||||
[5, null, 4, 5],
|
||||
[2, null, 5, 2],
|
||||
])
|
||||
},
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user