hyparquet/src/read.js

60 lines
2.2 KiB
JavaScript
Raw Normal View History

2024-01-15 19:08:48 +00:00
import { parquetMetadataAsync } from './metadata.js'
2025-05-25 21:49:59 +00:00
import { parquetPlan, prefetchAsyncBuffer } from './plan.js'
import { readRowGroup } from './rowgroup.js'
2024-04-07 16:33:57 +00:00
import { concat } from './utils.js'
2024-01-15 19:08:48 +00:00
/**
 * Read parquet data rows from a file-like object.
 * Reads the minimal number of row groups and columns to satisfy the request.
 *
 * Returns a void promise when complete.
 * Errors are thrown on the returned promise.
 * Data is returned in callbacks onComplete, onChunk, onPage, NOT the return promise.
 * See parquetReadObjects for a more convenient API.
 *
 * Rows are selected by the half-open range [rowStart, rowEnd). A row group is
 * read only if it contains at least one row in that range.
 *
 * If options.metadata is not set, it is loaded from the file and stored on
 * the options object. options.file is not modified.
 *
 * @param {ParquetReadOptions} options read options
 * @returns {Promise<void>} resolves when all requested rows and columns are parsed, all errors are thrown here
 */
export async function parquetRead(options) {
  // `!(x >= 0)` also rejects undefined and NaN byteLength
  if (!options.file || !(options.file.byteLength >= 0)) {
    throw new Error('parquetRead expected file AsyncBuffer')
  }

  // load metadata if not provided
  options.metadata ??= await parquetMetadataAsync(options.file)
  const { metadata, onComplete, rowStart = 0, rowEnd } = options
  // rowStart of 0 is allowed, only negative values are rejected
  if (rowStart < 0) throw new Error('parquetRead rowStart must be positive')

  // prefetch byte ranges
  const plan = parquetPlan(options)
  // Read through the prefetched buffer using a shallow copy of the options,
  // so the caller's options.file is not replaced and the same options object
  // can be reused for another read.
  const readOptions = { ...options, file: prefetchAsyncBuffer(options.file, plan) }

  /** @type {any[][]} */
  const rowData = []

  // read row groups
  let groupStart = 0 // first row index of the current group
  for (const rowGroup of metadata.row_groups) {
    // number of rows in this row group
    const groupRows = Number(rowGroup.num_rows)
    const groupEnd = groupStart + groupRows // exclusive end row index

    // Read the group only if [groupStart, groupEnd) overlaps [rowStart, rowEnd).
    // The comparison with rowStart is strict: a group that ends exactly at
    // rowStart (or an empty group) contains no requested rows.
    const overlaps = groupEnd > rowStart && (rowEnd === undefined || groupStart < rowEnd)
    if (overlaps) {
      const groupData = await readRowGroup(readOptions, rowGroup, groupStart)
      if (onComplete) {
        // filter to rows in range, using indexes relative to this group
        const start = Math.max(rowStart - groupStart, 0)
        const end = rowEnd === undefined ? undefined : rowEnd - groupStart
        concat(rowData, groupData.slice(start, end))
      }
    }
    groupStart = groupEnd
  }

  if (onComplete) onComplete(rowData)
}
/**
 * @import {DecodedArray, ParquetReadOptions, RowGroup, RowGroupSelect, SchemaTree} from '../src/types.d.ts'
 */