hyparquet/src/query.js

import { parquetReadObjects } from './hyparquet.js'
import { parquetMetadataAsync } from './metadata.js'

/**
 * Wraps parquetRead with orderBy support.
 * This is a parquet-aware query engine that can read a subset of rows and columns.
 * Accepts an optional orderBy column name to sort the results.
 * Note that using orderBy may SIGNIFICANTLY increase the query time.
 *
 * @param {ParquetReadOptions & { orderBy?: string }} options
 * @returns {Promise<Record<string, any>[]>} resolves when all requested rows and columns are parsed
 */
export async function parquetQuery(options) {
  const { file, rowStart, rowEnd, orderBy } = options
  options.metadata ||= await parquetMetadataAsync(file)

  // TODO: Faster path for: no orderBy, no rowStart/rowEnd, one row group
  if (typeof orderBy === 'string') {
    // Fetch orderBy column first
    const orderColumn = await parquetReadObjects({ ...options, rowStart: undefined, rowEnd: undefined, columns: [orderBy] })

    // Sort row indices by the orderBy column, then keep the requested slice
    const sortedIndices = Array.from(orderColumn, (_, index) => index)
      .sort((a, b) => compare(orderColumn[a][orderBy], orderColumn[b][orderBy]))
      .slice(rowStart, rowEnd)

    // Fetch only the row groups that contain the requested rows, then re-order
    const sparseData = await parquetReadRows({ ...options, rows: sortedIndices })
    const data = sortedIndices.map(index => sparseData[index])
    return data
  } else {
    return await parquetReadObjects(options)
  }
}
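
// Example usage (a sketch, not part of this module: `file` is an AsyncBuffer obtained
// elsewhere, e.g. via asyncBufferFromFile or asyncBufferFromUrl, and the column names
// are assumptions):
//
//   const rows = await parquetQuery({
//     file,
//     columns: ['id', 'name'],
//     orderBy: 'name',
//     rowStart: 0,
//     rowEnd: 10,
//   })
//   // rows: the ten rows with the smallest `name` values, in sorted order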

/**
 * Reads a list of rows from a parquet file, reading only the row groups that contain the rows.
 * Returns a sparse array of rows.
 * @import {ParquetReadOptions} from '../src/types.d.ts'
 * @param {ParquetReadOptions & { rows: number[] }} options
 * @returns {Promise<Record<string, any>[]>}
 */
async function parquetReadRows(options) {
  const { file, rows } = options
  options.metadata ||= await parquetMetadataAsync(file)
  const { row_groups: rowGroups } = options.metadata

  // Compute row groups to fetch
  const groupIncluded = Array(rowGroups.length).fill(false)
  let groupStart = 0
  const groupEnds = rowGroups.map(group => groupStart += Number(group.num_rows))
  for (const index of rows) {
    const groupIndex = groupEnds.findIndex(end => index < end)
    groupIncluded[groupIndex] = true
  }

  // Compute row ranges to fetch
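  // Illustration (assumed numbers, not from any real file): with four 100-row groups and
  // rows = [5, 105, 350], groups 0, 1, and 3 are included, so the merged ranges computed
  // below are [0, 200) and [300, 400).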
  const rowRanges = []
  let rangeStart
  groupStart = 0
  for (let i = 0; i < groupIncluded.length; i++) {
    const groupEnd = groupStart + Number(rowGroups[i].num_rows)
    if (groupIncluded[i]) {
      if (rangeStart === undefined) {
        rangeStart = groupStart // open a new range at the start of this group
      }
    } else {
      if (rangeStart !== undefined) {
        // close the open range at the start of this excluded group
        rowRanges.push([rangeStart, groupStart])
        rangeStart = undefined
      }
    }
    groupStart = groupEnd
  }
  if (rangeStart !== undefined) {
    rowRanges.push([rangeStart, groupStart])
  }

  // Fetch by row group and map to rows
  const sparseData = new Array(Number(options.metadata.num_rows))
  for (const [rangeStart, rangeEnd] of rowRanges) {
    // TODO: fetch in parallel
    const groupData = await parquetReadObjects({ ...options, rowStart: rangeStart, rowEnd: rangeEnd })
    for (let i = rangeStart; i < rangeEnd; i++) {
      sparseData[i] = groupData[i - rangeStart]
      sparseData[i].__index__ = i // remember the row's original position in the file
    }
  }
  return sparseData
}
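
// For illustration (an assumed call, not from this file): parquetReadRows({ file, rows: [2, 7] })
// resolves to a sparse array in which only the row groups covering rows 2 and 7 are populated,
// and each populated row carries a __index__ property with its absolute row number.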

/**
 * @param {any} a
 * @param {any} b
 * @returns {number}
 */
function compare(a, b) {
  if (a < b) return -1
  if (a > b) return 1
  if (a === b) return 0
  return 1 // TODO: how to handle nulls?
}