import { assembleNested } from './assemble.js'
import { getColumnRange, readColumn } from './column.js'
import { parquetMetadataAsync } from './metadata.js'
import { getSchemaPath } from './schema.js'
import { concat } from './utils.js'

/**
 * Read parquet data rows from a file-like object.
 * Reads the minimal number of row groups and columns to satisfy the request.
 *
 * Returns a void promise when complete; any errors are thrown from that promise.
 * Data is returned via onComplete, not the return promise: if onComplete is
 * undefined, the data is still parsed and chunks are emitted, but the row view
 * is never computed. This saves allocations when the caller wants to cache the
 * full chunks and build their own view of the data from the chunks.
 *
 * @param {ParquetReadOptions} options read options
 * @returns {Promise<void>} resolves when all requested rows and columns are parsed, all errors are thrown here
 */
export async function parquetRead(options) {
  if (!options.file || !(options.file.byteLength >= 0)) {
    throw new Error('parquetRead expected file AsyncBuffer')
  }

  // load metadata if not provided
  options.metadata ||= await parquetMetadataAsync(options.file)
  if (!options.metadata) throw new Error('parquet metadata not found')

  const { metadata, onComplete, rowEnd } = options
  const rowStart = options.rowStart || 0
  /** @type {any[][]} */
  const rowData = []

  // find which row groups to read
  let groupStart = 0 // first row index of the current group
  for (const rowGroup of metadata.row_groups) {
    // number of rows in this row group
    const groupRows = Number(rowGroup.num_rows)
    // if row group overlaps with row range, read it
    if (groupStart + groupRows >= rowStart && (rowEnd === undefined || groupStart < rowEnd)) {
      // read row group
      const rowLimit = rowEnd && rowEnd - groupStart
      const groupData = await readRowGroup(options, rowGroup, groupStart, rowLimit)
      if (onComplete) {
        // filter to rows in range
        const start = Math.max(rowStart - groupStart, 0)
        const end = rowEnd === undefined ? undefined : rowEnd - groupStart
        concat(rowData, groupData.slice(start, end))
      }
    }
    groupStart += groupRows
  }

  if (onComplete) onComplete(rowData)
}
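
// Usage sketch (illustrative, not part of this module). parquetRead expects an
// AsyncBuffer: any object with a byteLength and a slice(start, end) method that
// returns an ArrayBuffer or a promise of one. The module path, column names,
// and row range below are hypothetical.
//
//   import { parquetRead } from './read.js' // assuming this module is read.js
//
//   // given an ArrayBuffer `arrayBuffer` of parquet bytes, obtained elsewhere:
//   const file = {
//     byteLength: arrayBuffer.byteLength,
//     slice: (start, end) => arrayBuffer.slice(start, end),
//   }
//   await parquetRead({
//     file,
//     columns: ['id', 'name'], // hypothetical column names
//     rowStart: 0,
//     rowEnd: 100,
//     rowFormat: 'object', // rows as objects keyed by column name; omit for arrays
//     onComplete: rows => console.log(rows),
//   })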

/**
 * Read a row group from a file-like object.
 *
 * @param {ParquetReadOptions} options read options
 * @param {RowGroup} rowGroup row group to read
 * @param {number} groupStart row index of the first row in the group
 * @param {number} [rowLimit] max rows to read from this group
 * @returns {Promise<any[][]>} resolves to row data
 */
export async function readRowGroup(options, rowGroup, groupStart, rowLimit) {
  const { file, metadata, columns } = options
  if (!metadata) throw new Error('parquet metadata not found')
  if (rowLimit === undefined || rowLimit > rowGroup.num_rows) rowLimit = Number(rowGroup.num_rows)

  // loop through metadata to find min/max bytes to read
  let [groupStartByte, groupEndByte] = [file.byteLength, 0]
  for (const { meta_data } of rowGroup.columns) {
    if (!meta_data) throw new Error('parquet column metadata is undefined')
    // skip columns that are not requested
    if (columns && !columns.includes(meta_data.path_in_schema[0])) continue

    const [columnStartByte, columnEndByte] = getColumnRange(meta_data).map(Number)
    groupStartByte = Math.min(groupStartByte, columnStartByte)
    groupEndByte = Math.max(groupEndByte, columnEndByte)
  }
  if (groupStartByte >= groupEndByte && columns?.length) {
    throw new Error(`parquet columns not found: ${columns.join(', ')}`)
  }
  // if row group size is less than 32mb, pre-load in one read
  let groupBuffer
  if (groupEndByte - groupStartByte <= 1 << 25) {
    // pre-load row group byte data in one big read,
    // otherwise read column data individually
    groupBuffer = await file.slice(groupStartByte, groupEndByte)
  }

  const promises = []
  // Top-level columns to assemble
  const { children } = getSchemaPath(metadata.schema, [])[0]
  const subcolumnNames = new Map(children.map(child => [child.element.name, getSubcolumns(child)]))
  /** @type {Map<string, DecodedArray[]>} */
  const subcolumnData = new Map() // columns to assemble as maps
  // read column data
  for (let i = 0; i < rowGroup.columns.length; i++) {
    const columnMetadata = rowGroup.columns[i].meta_data
    if (!columnMetadata) throw new Error('parquet column metadata is undefined')

    // skip columns that are not requested
    const columnName = columnMetadata.path_in_schema[0]
    if (columns && !columns.includes(columnName)) continue

    const [columnStartByte, columnEndByte] = getColumnRange(columnMetadata).map(Number)
    const columnBytes = columnEndByte - columnStartByte

    // skip columns larger than 1gb
    // TODO: stream process the data, returning only the requested rows
    if (columnBytes > 1 << 30) {
      console.warn(`parquet skipping huge column "${columnMetadata.path_in_schema}" ${columnBytes.toLocaleString()} bytes`)
      // TODO: set column to new Error('parquet column too large')
      continue
    }

    // use pre-loaded row group byte data if available, else read column data
    /** @type {Promise<ArrayBuffer>} */
    let buffer
    let bufferOffset = 0
    if (groupBuffer) {
      buffer = Promise.resolve(groupBuffer)
      bufferOffset = columnStartByte - groupStartByte
    } else {
      // wrap awaitable to ensure it's a promise
      buffer = Promise.resolve(file.slice(columnStartByte, columnEndByte))
    }

    // read column data async
    promises.push(buffer.then(arrayBuffer => {
      const schemaPath = getSchemaPath(metadata.schema, columnMetadata.path_in_schema)
      const reader = { view: new DataView(arrayBuffer), offset: bufferOffset }
      const columnData = readColumn(reader, rowLimit, columnMetadata, schemaPath, options)
      /** @type {DecodedArray[] | undefined} */
      let chunks = columnData

      // TODO: fast path for non-nested columns
      // Save column data for assembly
      const subcolumn = columnMetadata.path_in_schema.join('.')
      subcolumnData.set(subcolumn, chunks)
      chunks = undefined

      const subcolumns = subcolumnNames.get(columnName)
      if (subcolumns?.every(name => subcolumnData.has(name))) {
        // For every subcolumn, flatten and assemble the column
        const flatData = new Map(subcolumns.map(name => [name, flatten(subcolumnData.get(name))]))
        assembleNested(flatData, schemaPath[1])
        const flatColumn = flatData.get(columnName)
        if (!flatColumn) throw new Error(`parquet column data not assembled: ${columnName}`)
        chunks = [flatColumn]
        subcolumns.forEach(name => subcolumnData.delete(name))
        subcolumnData.set(columnName, chunks)
      }

      // do not emit column data until structs are fully parsed
      if (!chunks) return
      // notify caller of column data
      for (const chunk of chunks) {
        options.onChunk?.({
          columnName,
          columnData: chunk,
          rowStart: groupStart,
          rowEnd: groupStart + rowLimit,
        })
      }
    }))
  }

  await Promise.all(promises)
  if (options.onComplete) {
    const includedColumnNames = children
      .map(child => child.element.name)
      .filter(name => !columns || columns.includes(name))
    const columnOrder = columns || includedColumnNames
    const includedColumns = columnOrder
      .map(name => includedColumnNames.includes(name) ? flatten(subcolumnData.get(name)) : undefined)

    // transpose columns into rows
    const groupData = new Array(rowLimit)
    for (let row = 0; row < rowLimit; row++) {
      if (options.rowFormat === 'object') {
        // return each row as an object
        /** @type {Record<string, any>} */
        const rowData = {}
        for (let i = 0; i < columnOrder.length; i++) {
          rowData[columnOrder[i]] = includedColumns[i]?.[row]
        }
        groupData[row] = rowData
      } else {
        // return each row as an array
        groupData[row] = includedColumns.map(column => column?.[row])
      }
    }
    return groupData
  }
  return []
}
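
// Chunk streaming sketch (illustrative). Passing onChunk without onComplete
// parses the requested columns and emits per-column chunks, but skips building
// the row view. The column name below is hypothetical; `file` is an AsyncBuffer
// as in the sketch above.
//
//   await parquetRead({
//     file,
//     columns: ['temperature'],
//     onChunk({ columnName, columnData, rowStart, rowEnd }) {
//       // columnData is a DecodedArray covering rows [rowStart, rowEnd)
//       console.log(columnName, rowStart, rowEnd, columnData.length)
//     },
//   })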

/**
 * Flatten a list of lists into a single list.
 *
 * @param {DecodedArray[] | undefined} chunks
 * @returns {DecodedArray}
 */
function flatten(chunks) {
  if (!chunks) return []
  if (chunks.length === 1) return chunks[0]
  /** @type {any[]} */
  const output = []
  for (const chunk of chunks) {
    concat(output, chunk)
  }
  return output
}
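
// Example sketch: flatten([[1, 2], [3]]) returns [1, 2, 3]; a single chunk is
// returned as-is without copying.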

/**
 * Return a list of sub-columns needed to construct a top-level column.
 *
 * @import {DecodedArray, ParquetReadOptions, RowGroup, SchemaTree} from '../src/types.d.ts'
 * @param {SchemaTree} schema
 * @param {string[]} output
 * @returns {string[]}
 */
function getSubcolumns(schema, output = []) {
  if (schema.children.length) {
    for (const child of schema.children) {
      getSubcolumns(child, output)
    }
  } else {
    output.push(schema.path.join('.'))
  }
  return output
}
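
// Example sketch (hypothetical schema trees): a flat column is its own only
// subcolumn, e.g. getSubcolumns({ element: { name: 'id' }, path: ['id'], children: [] })
// returns ['id']; a nested column yields its leaf paths, such as
// ['tags.list.element'] for a standard three-level LIST column named 'tags'.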