2025-05-27 00:27:15 +00:00
|
|
|
import { parquetMetadataAsync, parquetSchema } from './metadata.js'
|
2025-05-25 21:49:59 +00:00
|
|
|
import { parquetPlan, prefetchAsyncBuffer } from './plan.js'
|
2025-05-27 00:27:15 +00:00
|
|
|
import { assembleAsync, asyncGroupToRows, readRowGroup } from './rowgroup.js'
|
|
|
|
|
import { concat, flatten } from './utils.js'
|
2024-01-15 19:08:48 +00:00
|
|
|
|
2025-05-25 22:21:58 +00:00
|
|
|
/**
|
2025-09-16 22:29:44 +00:00
|
|
|
* @import {AsyncRowGroup, DecodedArray, ParquetReadOptions, BaseParquetReadOptions} from '../src/types.js'
|
2025-05-25 22:21:58 +00:00
|
|
|
*/
|
2024-01-15 19:08:48 +00:00
|
|
|
/**
 * Read parquet data rows from a file-like object.
 * Reads the minimal number of row groups and columns to satisfy the request.
 *
 * Returns a void promise when complete.
 * Errors are thrown on the returned promise, including errors thrown by onChunk.
 * Data is returned in callbacks onComplete, onChunk, onPage, NOT the return promise.
 * See parquetReadObjects for a more convenient API.
 *
 * If options.metadata is not provided, it is loaded from options.file and
 * stored on the options object.
 *
 * @param {ParquetReadOptions} options read options
 * @returns {Promise<void>} resolves when all requested rows and columns are parsed, all errors are thrown here
 */
export async function parquetRead(options) {
  // load metadata if not provided
  options.metadata ??= await parquetMetadataAsync(options.file)

  // read row groups
  const asyncGroups = parquetReadAsync(options)

  const { rowStart = 0, rowEnd, columns, onChunk, onComplete, rowFormat } = options

  // skip assembly if no onComplete or onChunk, but wait for reading to finish
  if (!onComplete && !onChunk) {
    for (const { asyncColumns } of asyncGroups) {
      for (const { data } of asyncColumns) await data
    }
    return
  }

  // assemble struct columns
  const schemaTree = parquetSchema(options.metadata)
  const assembled = asyncGroups.map(arg => assembleAsync(arg, schemaTree))

  // onChunk: emit chunks as soon as each column resolves (not awaited here)
  /** @type {Promise<void>[]} */
  const chunkEmits = []
  if (onChunk) {
    for (const asyncGroup of assembled) {
      for (const asyncColumn of asyncGroup.asyncColumns) {
        const emit = asyncColumn.data.then(columnDatas => {
          // running row offset of the current chunk, starting at the group start
          let chunkStart = asyncGroup.groupStart
          for (const columnData of columnDatas) {
            onChunk({
              columnName: asyncColumn.pathInSchema[0],
              columnData,
              rowStart: chunkStart,
              rowEnd: chunkStart + columnData.length,
            })
            chunkStart += columnData.length
          }
        })
        // Mark the promise as handled so that a failure cannot become an
        // unhandled rejection while other work is awaited; the error itself
        // is rethrown by the Promise.all below.
        emit.catch(() => {})
        chunkEmits.push(emit)
      }
    }
  }

  if (onComplete) {
    // onComplete transpose column chunks to rows
    // loosen the types to avoid duplicate code
    /** @type {any[]} */
    const rows = []
    for (const asyncGroup of assembled) {
      // filter to rows in range
      const selectStart = Math.max(rowStart - asyncGroup.groupStart, 0)
      const selectEnd = Math.min((rowEnd ?? Infinity) - asyncGroup.groupStart, asyncGroup.groupRows)
      // transpose column chunks to rows in output
      const groupData = rowFormat === 'object' ?
        await asyncGroupToRows(asyncGroup, selectStart, selectEnd, columns, 'object') :
        await asyncGroupToRows(asyncGroup, selectStart, selectEnd, columns, 'array')
      concat(rows, groupData)
    }
    // surface onChunk failures before reporting completion
    await Promise.all(chunkEmits)
    onComplete(rows)
  } else {
    // wait for all async groups to finish (complete takes care of this)
    for (const { asyncColumns } of assembled) {
      for (const { data } of asyncColumns) await data
    }
    // surface onChunk failures on the returned promise
    await Promise.all(chunkEmits)
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Plan the read and start reading every planned row group.
 *
 * Builds a plan with parquetPlan, wraps the file with prefetchAsyncBuffer,
 * and calls readRowGroup once per group in the plan.
 * The caller's options object is left untouched: the prefetch wrapper is placed
 * on a shallow copy, so calling this again with the same options does not wrap
 * an already wrapped file.
 *
 * @param {ParquetReadOptions} options read options, metadata is required
 * @returns {AsyncRowGroup[]} one entry per group in the plan
 * @throws {Error} if options.metadata is missing
 */
export function parquetReadAsync(options) {
  if (!options.metadata) throw new Error('parquet requires metadata')
  // TODO: validate options (start, end, columns, etc)

  // prefetch byte ranges
  const plan = parquetPlan(options)
  // shallow copy so the caller-owned options.file is not overwritten
  const readOptions = { ...options, file: prefetchAsyncBuffer(options.file, plan) }

  // read row groups
  return plan.groups.map(groupPlan => readRowGroup(readOptions, plan, groupPlan))
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reads a single column from a parquet file.
 *
 * Exactly one column name must be given in options.columns.
 * If options.metadata is not provided, it is loaded from options.file and
 * stored on the options object.
 * For each row group, the data of the first assembled column is flattened,
 * and the per-group results are flattened into the returned array.
 *
 * @param {BaseParquetReadOptions} options
 * @returns {Promise<DecodedArray>}
 */
export async function parquetReadColumn(options) {
  const hasSingleColumn = options.columns?.length === 1
  if (!hasSingleColumn) {
    throw new Error('parquetReadColumn expected columns: [columnName]')
  }

  // load metadata if not provided
  options.metadata ??= await parquetMetadataAsync(options.file)

  // start reading row groups, then assemble struct columns
  const rowGroups = parquetReadAsync(options)
  const schemaTree = parquetSchema(options.metadata)
  const assembledGroups = rowGroups.map(group => assembleAsync(group, schemaTree))

  // await the groups one at a time, in order
  /** @type {DecodedArray[]} */
  const perGroup = []
  for (const { asyncColumns } of assembledGroups) {
    const chunks = await asyncColumns[0].data
    perGroup.push(flatten(chunks))
  }
  return flatten(perGroup)
}
|
2025-08-15 20:09:00 +00:00
|
|
|
|
|
|
|
|
/**
 * Convenience helper that reads parquet rows and returns them as a promise.
 * It wraps the more configurable parquetRead function, using the promise's
 * resolver as the onComplete callback.
 *
 * @param {Omit<ParquetReadOptions, 'onComplete'>} options
 * @returns {Promise<Record<string, any>[]>} resolves when all requested rows and columns are parsed
 */
export function parquetReadObjects(options) {
  return new Promise((resolve, reject) => {
    // rowFormat and onComplete follow the spread so they override caller values:
    // object output is forced and the rows resolve the promise
    parquetRead({ ...options, rowFormat: 'object', onComplete: resolve }).catch(reject)
  })
}
|