hyparquet/src/hyparquet.d.ts

import type { AsyncBuffer, Compressors, FileMetaData, SchemaTree } from './types.d.ts'
export type { AsyncBuffer, Compressors, FileMetaData, SchemaTree }
/**
* Read parquet data rows from a file-like object.
* Reads the minimal number of row groups and columns to satisfy the request.
*
 * Returns a void promise when complete, and rejects on error.
 * Data is returned via onComplete, not the return promise, because
 * if onComplete is undefined we still parse the data and emit chunks, but skip
 * assembling the row view. This saves allocations when the caller
 * wants to cache the raw chunks and construct their own view of the data
 * from the chunks.
*
* @param {object} options read options
* @param {AsyncBuffer} options.file file-like object containing parquet data
* @param {FileMetaData} [options.metadata] parquet file metadata
* @param {string[]} [options.columns] columns to read, all columns if undefined
* @param {number} [options.rowStart] first requested row index (inclusive)
 * @param {number} [options.rowEnd] end row index (exclusive)
 * @param {Function} [options.onChunk] called when a column chunk is parsed. Chunks may include row data outside the requested range.
* @param {Function} [options.onComplete] called when all requested rows and columns are parsed
 * @param {Compressors} [options.compressors] custom decompressors
* @returns {Promise<void>} resolves when all requested rows and columns are parsed
*/
export function parquetRead(options: ParquetReadOptions): Promise<void>
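
/*
 * Usage sketch (illustrative, not part of the declarations): read the first
 * ten rows of two columns. The file path and column names are hypothetical.
 *
 *   const file = await asyncBufferFromFile('example.parquet')
 *   await parquetRead({
 *     file,
 *     columns: ['name', 'age'],
 *     rowStart: 0,
 *     rowEnd: 10,
 *     onComplete: rows => console.log(rows), // rows: any[][]
 *   })
 */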
/**
* Read parquet metadata from an async buffer.
*
* An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
* asynchronously, possibly over the network.
*
* You must provide the byteLength of the buffer, typically from a HEAD request.
*
 * In theory, you could use a suffix-range request to fetch the end of the file
 * and save a round trip. In practice this doesn't work, because Chrome treats
 * the suffix-range Range header as not safe-listed and will require a CORS
 * preflight. So the byteLength is required.
*
 * To make this efficient, we initially request the last 512kb of the file,
 * which is likely to contain the metadata. If the metadata length exceeds the
 * initial 512kb fetch, we request the remainder of the metadata from the AsyncBuffer.
 *
 * This ensures we make at most two requests: one initial 512kb request, and
 * possibly a second request for the rest of the metadata.
*
* @param {AsyncBuffer} asyncBuffer parquet file contents
* @param {number} initialFetchSize initial fetch size in bytes (default 512kb)
* @returns {Promise<FileMetaData>} parquet metadata object
*/
export function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetchSize?: number): Promise<FileMetaData>
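
/*
 * Sketch of a hand-rolled AsyncBuffer over fetch, assuming AsyncBuffer exposes
 * byteLength and an async slice(start, end) as described above. The url is
 * hypothetical; asyncBufferFromUrl below packages the same pattern.
 *
 *   const url = 'https://example.com/data.parquet'
 *   const head = await fetch(url, { method: 'HEAD' })
 *   const byteLength = Number(head.headers.get('Content-Length'))
 *   const file = {
 *     byteLength,
 *     async slice(start, end) {
 *       const range = `bytes=${start}-${end - 1}` // end is exclusive, Range is inclusive
 *       const res = await fetch(url, { headers: { Range: range } })
 *       return res.arrayBuffer()
 *     },
 *   }
 *   const metadata = await parquetMetadataAsync(file)
 */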
/**
 * Read parquet metadata from a buffer.
*
* @param {ArrayBuffer} arrayBuffer parquet file contents
* @returns {FileMetaData} parquet metadata object
*/
export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData
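
/*
 * Sketch: reading metadata from a local file in Node ('data.parquet' is
 * hypothetical). The slice() copy matters because a Node Buffer may be a view
 * into a larger shared ArrayBuffer.
 *
 *   import { readFileSync } from 'fs'
 *   const buf = readFileSync('data.parquet')
 *   const arrayBuffer = buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength)
 *   const metadata = parquetMetadata(arrayBuffer)
 */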
/**
* Return a tree of schema elements from parquet metadata.
*
* @param {FileMetaData} metadata parquet metadata object
* @returns {SchemaTree} tree of schema elements
*/
export function parquetSchema(metadata: FileMetaData): SchemaTree
/**
* Decompress snappy data.
* Accepts an output buffer to avoid allocating a new buffer for each call.
*
* @param {Uint8Array} input compressed data
* @param {Uint8Array} output output buffer
* @returns {boolean} true if successful
*/
export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean
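
/*
 * Sketch: the output buffer must be pre-allocated to the uncompressed length,
 * which for parquet pages comes from the page header. The compressedBytes and
 * uncompressedSize below are hypothetical.
 *
 *   const output = new Uint8Array(uncompressedSize)
 *   const ok = snappyUncompress(compressedBytes, output)
 *   if (!ok) throw new Error('snappy decompression failed')
 */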
/**
* Replace bigints with numbers.
* When parsing parquet files, bigints are used to represent 64-bit integers.
* However, JSON does not support bigints, so it's helpful to convert to numbers.
*
* @param {any} obj object to convert
 * @returns {any} converted object
*/
export function toJson(obj: any): any
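
/*
 * Sketch: JSON.stringify throws a TypeError on bigints, so convert first:
 *
 *   const metadata = parquetMetadata(arrayBuffer)
 *   const json = JSON.stringify(toJson(metadata)) // bigints replaced with numbers
 */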
/**
* Construct an AsyncBuffer for a URL.
*
 * @param {string} url url of the parquet file
 * @returns {Promise<AsyncBuffer>} file-like object for the remote file
*/
export function asyncBufferFromUrl(url: string): Promise<AsyncBuffer>
/**
* Construct an AsyncBuffer for a local file using node fs package.
*
 * @param {string} filename path to the parquet file
 * @returns {Promise<AsyncBuffer>} file-like object for the local file
*/
export function asyncBufferFromFile(filename: string): Promise<AsyncBuffer>
/**
* Parquet query options for reading data
*/
export interface ParquetReadOptions {
file: AsyncBuffer // file-like object containing parquet data
metadata?: FileMetaData // parquet metadata, will be parsed if not provided
columns?: string[] // columns to read, all columns if undefined
rowStart?: number // inclusive
rowEnd?: number // exclusive
onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. Chunks may include rows outside the requested range.
onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed
compressors?: Compressors // custom decompressors
utf8?: boolean // decode byte arrays as utf8 strings (default true)
}
/**
* A run of column data
*/
export interface ColumnData {
columnName: string
columnData: ArrayLike<any>
rowStart: number
rowEnd: number
}
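
/*
 * Sketch: cache raw chunks via onChunk and omit onComplete, skipping the
 * row-view allocation described for parquetRead above ('data.parquet' is
 * hypothetical).
 *
 *   const chunks: ColumnData[] = []
 *   const file = await asyncBufferFromFile('data.parquet')
 *   await parquetRead({
 *     file,
 *     onChunk: chunk => chunks.push(chunk), // covers rows [chunk.rowStart, chunk.rowEnd)
 *   })
 */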