import type { AsyncBuffer, Compressors, FileMetaData, SchemaTree } from './types.d.ts'
export type { AsyncBuffer, Compressors, FileMetaData, SchemaTree }
/**
 * Read parquet data rows from a file-like object.
 * Reads the minimal number of row groups and columns to satisfy the request.
 *
 * Returns a void promise when complete; errors are reported via promise rejection.
 * Data is returned through onComplete, not the return promise, because
 * if onComplete is undefined, we parse the data, and emit chunks, but skip
 * computing the row view directly. This saves on allocation if the caller
 * wants to cache the full chunks, and make their own view of the data from
 * the chunks.
 *
 * @param {object} options read options
 * @param {AsyncBuffer} options.file file-like object containing parquet data
 * @param {FileMetaData} [options.metadata] parquet file metadata, will be parsed if not provided
 * @param {string[]} [options.columns] columns to read, all columns if undefined
 * @param {number} [options.rowStart] first requested row index (inclusive)
 * @param {number} [options.rowEnd] last requested row index (exclusive)
 * @param {Function} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.
 * @param {Function} [options.onComplete] called when all requested rows and columns are parsed
 * @param {Compressors} [options.compressors] custom decompressors
 * @param {boolean} [options.utf8] decode byte arrays as utf8 strings (default true)
 * @returns {Promise<void>} resolves when all requested rows and columns are parsed
 */
export function parquetRead(options: ParquetReadOptions): Promise<void>
/**
 * Read parquet metadata from an async buffer.
 *
 * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
 * asynchronously, possibly over the network.
 *
 * You must provide the byteLength of the buffer, typically from a HEAD request.
 *
 * In theory, you could use suffix-range requests to fetch the end of the file,
 * and save a round trip. But in practice, this doesn't work because chrome
 * deems suffix-range requests as a not-safe-listed header, and will require
 * a pre-flight. So the byteLength is required.
 *
 * To make this efficient, we initially request the last 512kb of the file,
 * which is likely to contain the metadata. If the metadata length exceeds the
 * initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer.
 *
 * This ensures that we either make one 512kb initial request for the metadata,
 * or a second request for up to the metadata size.
 *
 * @param {AsyncBuffer} asyncBuffer parquet file contents
 * @param {number} [initialFetchSize] initial fetch size in bytes (default 512kb)
 * @returns {Promise<FileMetaData>} parquet metadata object
 */
export function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetchSize?: number): Promise<FileMetaData>
/**
 * Read parquet metadata from a buffer synchronously.
 * The buffer must contain the complete parquet file footer.
 *
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @returns {FileMetaData} parquet metadata object
 */
export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData
/**
 * Return a tree of schema elements from parquet metadata.
 *
 * @param {FileMetaData} metadata parquet metadata object
 * @returns {SchemaTree} tree of schema elements
 */
export function parquetSchema(metadata: FileMetaData): SchemaTree
/**
 * Decompress snappy data.
 * Accepts an output buffer to avoid allocating a new buffer for each call.
 * The output buffer must be sized for the uncompressed data in advance.
 *
 * @param {Uint8Array} input compressed data
 * @param {Uint8Array} output output buffer, written in place
 * @returns {boolean} true if successful
 */
export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean
/**
 * Replace bigints with numbers.
 * When parsing parquet files, bigints are used to represent 64-bit integers.
 * However, JSON does not support bigints, so it's helpful to convert to numbers.
 *
 * NOTE(review): the declared return type is `any`; `unknown` would be safer for
 * callers, but changing it here would be a breaking interface change — confirm.
 *
 * @param {any} obj object to convert
 * @returns {any} converted object
 */
export function toJson(obj: any): any
/**
 * Construct an AsyncBuffer for a URL.
 * Slices of the buffer are fetched from the network on demand.
 *
 * @param {string} url URL of the parquet file
 * @returns {Promise<AsyncBuffer>} file-like object backed by HTTP requests
 */
export function asyncBufferFromUrl(url: string): Promise<AsyncBuffer>
/**
 * Construct an AsyncBuffer for a local file using node fs package.
 * Node-only; not available in the browser.
 *
 * @param {string} filename path to the parquet file on disk
 * @returns {Promise<AsyncBuffer>} file-like object backed by local file reads
 */
export function asyncBufferFromFile(filename: string): Promise<AsyncBuffer>
/**
 * Parquet query options for reading data.
 * Passed to {@link parquetRead}; only `file` is required.
 */
export interface ParquetReadOptions {
  file: AsyncBuffer // file-like object containing parquet data
  metadata?: FileMetaData // parquet metadata, will be parsed if not provided
  columns?: string[] // columns to read, all columns if undefined
  rowStart?: number // first requested row index (inclusive)
  rowEnd?: number // last requested row index (exclusive)
  onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may be outside the requested range.
  onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed
  compressors?: Compressors // custom decompressors
  utf8?: boolean // decode byte arrays as utf8 strings (default true)
}
/**
 * A run of column data.
 * Emitted via {@link ParquetReadOptions.onChunk} as each column chunk is parsed;
 * the row range may extend beyond the rows requested by the caller.
 */
export interface ColumnData {
  columnName: string // name of the column this chunk belongs to
  columnData: ArrayLike<any> // decoded values for this run of rows
  rowStart: number // first row index covered by this chunk (inclusive)
  rowEnd: number // last row index covered by this chunk (exclusive)
}