mirror of
https://github.com/asadbek064/hyparquet.git
synced 2025-12-15 02:16:37 +00:00
105 lines
3.5 KiB
JavaScript
105 lines
3.5 KiB
JavaScript
|
|
import { concat } from './utils.js'
|
||
|
|
|
||
|
|
// Combine column chunks into a single byte range if less than 32mb
|
||
|
|
const columnChunkAggregation = 1 << 25 // 32mb
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @import {AsyncBuffer, ByteRange, ColumnMetaData, GroupPlan, ParquetReadOptions, QueryPlan} from '../src/types.js'
|
||
|
|
*/
|
||
|
|
/**
|
||
|
|
* Plan which byte ranges to read to satisfy a read request.
|
||
|
|
* Metadata must be non-null.
|
||
|
|
*
|
||
|
|
* @param {ParquetReadOptions} options
|
||
|
|
* @returns {QueryPlan}
|
||
|
|
*/
|
||
|
|
export function parquetPlan({ metadata, rowStart = 0, rowEnd = Infinity, columns }) {
|
||
|
|
if (!metadata) throw new Error('parquetPlan requires metadata')
|
||
|
|
/** @type {GroupPlan[]} */
|
||
|
|
const groups = []
|
||
|
|
/** @type {ByteRange[]} */
|
||
|
|
const ranges = []
|
||
|
|
|
||
|
|
// find which row groups to read
|
||
|
|
let groupStart = 0 // first row index of the current group
|
||
|
|
for (const rowGroup of metadata.row_groups) {
|
||
|
|
const groupEnd = groupStart + Number(rowGroup.num_rows)
|
||
|
|
// if row group overlaps with row range, add it to the plan
|
||
|
|
if (groupEnd >= rowStart && groupStart < rowEnd) {
|
||
|
|
/** @type {ByteRange[]} */
|
||
|
|
const plan = []
|
||
|
|
// loop through each column chunk
|
||
|
|
for (const { meta_data } of rowGroup.columns) {
|
||
|
|
if (!meta_data) throw new Error('parquet column metadata is undefined')
|
||
|
|
// add included columns to the plan
|
||
|
|
if (!columns || columns.includes(meta_data.path_in_schema[0])) {
|
||
|
|
plan.push(getColumnRange(meta_data))
|
||
|
|
}
|
||
|
|
}
|
||
|
|
groups.push({ plan })
|
||
|
|
|
||
|
|
// map group plan to ranges
|
||
|
|
const groupSize = plan[plan.length - 1]?.endByte - plan[0]?.startByte
|
||
|
|
if (!columns && groupSize < columnChunkAggregation) {
|
||
|
|
// full row group
|
||
|
|
ranges.push({
|
||
|
|
startByte: plan[0].startByte,
|
||
|
|
endByte: plan[plan.length - 1].endByte,
|
||
|
|
})
|
||
|
|
} else if (plan.length) {
|
||
|
|
concat(ranges, plan)
|
||
|
|
} else if (columns?.length) {
|
||
|
|
throw new Error(`parquet columns not found: ${columns.join(', ')}`)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
groupStart = groupEnd
|
||
|
|
}
|
||
|
|
|
||
|
|
return { ranges, groups }
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* @param {ColumnMetaData} columnMetadata
|
||
|
|
* @returns {ByteRange}
|
||
|
|
*/
|
||
|
|
export function getColumnRange({ dictionary_page_offset, data_page_offset, total_compressed_size }) {
|
||
|
|
const columnOffset = dictionary_page_offset || data_page_offset
|
||
|
|
return {
|
||
|
|
startByte: Number(columnOffset),
|
||
|
|
endByte: Number(columnOffset + total_compressed_size),
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Prefetch byte ranges from an AsyncBuffer.
|
||
|
|
*
|
||
|
|
* @param {AsyncBuffer} file
|
||
|
|
* @param {QueryPlan} plan
|
||
|
|
* @returns {AsyncBuffer}
|
||
|
|
*/
|
||
|
|
export function prefetchAsyncBuffer(file, { ranges }) {
|
||
|
|
// fetch byte ranges from the file
|
||
|
|
const promises = ranges.map(({ startByte, endByte }) => file.slice(startByte, endByte))
|
||
|
|
return {
|
||
|
|
byteLength: file.byteLength,
|
||
|
|
slice(start, end = file.byteLength) {
|
||
|
|
// find matching slice
|
||
|
|
const index = ranges.findIndex(({ startByte, endByte }) => startByte <= start && end <= endByte)
|
||
|
|
if (index < 0) throw new Error(`no prefetch for range [${start}, ${end}]`)
|
||
|
|
if (ranges[index].startByte !== start || ranges[index].endByte !== end) {
|
||
|
|
// slice a subrange of the prefetch
|
||
|
|
const startOffset = start - ranges[index].startByte
|
||
|
|
const endOffset = end - ranges[index].startByte
|
||
|
|
if (promises[index] instanceof Promise) {
|
||
|
|
return promises[index].then(buffer => buffer.slice(startOffset, endOffset))
|
||
|
|
} else {
|
||
|
|
return promises[index].slice(startOffset, endOffset)
|
||
|
|
}
|
||
|
|
} else {
|
||
|
|
return promises[index]
|
||
|
|
}
|
||
|
|
},
|
||
|
|
}
|
||
|
|
}
|