2024-01-15 19:08:48 +00:00
|
|
|
import { parquetMetadataAsync } from './metadata.js'
|
2025-05-25 21:49:59 +00:00
|
|
|
import { parquetPlan, prefetchAsyncBuffer } from './plan.js'
|
|
|
|
|
import { readRowGroup } from './rowgroup.js'
|
2024-04-07 16:33:57 +00:00
|
|
|
import { concat } from './utils.js'
|
2024-01-15 19:08:48 +00:00
|
|
|
|
2025-05-25 22:21:58 +00:00
|
|
|
/**
|
|
|
|
|
* @import {ParquetReadOptions} from '../src/types.d.ts'
|
|
|
|
|
*/
|
2024-01-15 19:08:48 +00:00
|
|
|
/**
 * Read parquet data rows from a file-like object.
 * Reads the minimal number of row groups and columns to satisfy the request.
 *
 * Returns a void promise when complete.
 * Errors are thrown on the returned promise.
 * Data is returned in callbacks onComplete, onChunk, onPage, NOT the return promise.
 * See parquetReadObjects for a more convenient API.
 *
 * Note: the passed options object is mutated. `options.metadata` is filled in
 * when it was not provided, and `options.file` is replaced by the result of
 * prefetchAsyncBuffer.
 *
 * @param {ParquetReadOptions} options read options
 * @returns {Promise<void>} resolves when all requested rows and columns are parsed, all errors are thrown here
 */
export async function parquetRead(options) {
  // load metadata if not provided
  options.metadata ??= await parquetMetadataAsync(options.file)
  const { metadata, onComplete, rowStart = 0, rowEnd } = options
  // rowStart of 0 is accepted, only negative values are rejected
  if (rowStart < 0) throw new Error('parquetRead rowStart must be positive')

  // prefetch byte ranges
  const plan = parquetPlan(options)
  options.file = prefetchAsyncBuffer(options.file, plan)

  /** @type {any[][]} */
  const rowData = []

  // read row groups
  let groupStart = 0 // first row index of the current group
  for (const rowGroup of metadata.row_groups) {
    // number of rows in this row group
    const groupRows = Number(rowGroup.num_rows)
    const groupEnd = groupStart + groupRows // exclusive end row index of the group

    // Row ranges are half-open: the group covers [groupStart, groupEnd) and the
    // request covers [rowStart, rowEnd). A group that ends exactly at rowStart
    // holds no requested rows, so the comparison must be strict.
    const overlaps = groupEnd > rowStart && (rowEnd === undefined || groupStart < rowEnd)
    if (overlaps) {
      // read row group
      const groupData = await readRowGroup(options, rowGroup, groupStart)
      if (onComplete) {
        // filter to rows in range, using indexes relative to this group
        const start = Math.max(rowStart - groupStart, 0)
        const end = rowEnd === undefined ? undefined : rowEnd - groupStart
        concat(rowData, groupData.slice(start, end))
      }
    }
    groupStart = groupEnd
  }

  if (onComplete) onComplete(rowData)
}
|