hyparquet/src/hyparquet.d.ts

import type { AsyncBuffer, CompressionCodec, Compressors, ConvertedType, FileMetaData, LogicalType, ParquetType, SchemaTree } from './types.d.ts'

export type { AsyncBuffer, CompressionCodec, Compressors, ConvertedType, FileMetaData, LogicalType, ParquetType, SchemaTree }

/**
 * Read parquet data rows from a file-like object.
 * Reads the minimal number of row groups and columns to satisfy the request.
 *
 * The returned promise carries no data: it resolves when reading is complete,
 * and it is how errors are surfaced to the caller.
 * Rows are delivered through onComplete, not the returned promise, because
 * if onComplete is undefined, we parse the data and emit chunks, but skip
 * computing the row view directly. This saves on allocation if the caller
 * wants to cache the full chunks, and make their own view of the data from
 * the chunks.
 *
 * @param {ParquetReadOptions} options read options
 * @param {AsyncBuffer} options.file file-like object containing parquet data
 * @param {FileMetaData} [options.metadata] parquet file metadata, will be parsed if not provided
 * @param {string[]} [options.columns] columns to read, all columns if undefined
 * @param {string} [options.rowFormat] desired format of each row passed to the onComplete function
 * @param {number} [options.rowStart] first requested row index (inclusive)
 * @param {number} [options.rowEnd] last requested row index (exclusive)
 * @param {(chunk: ColumnData) => void} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.
 * @param {(rows: any[][]) => void} [options.onComplete] called when all requested rows and columns are parsed
 * @param {Compressors} [options.compressors] custom decompressors
 * @param {boolean} [options.utf8] decode byte arrays as utf8 strings (default true)
 * @returns {Promise<void>} resolves when all requested rows and columns are parsed
 */
export function parquetRead(options: ParquetReadOptions): Promise<void>

/**
 * Read parquet data and return a Promise of object-oriented row data.
 *
 * Unlike parquetRead, the rows are delivered through the returned promise.
 * The parameter is typed as the full ParquetReadOptions; only the options
 * listed below are documented for this function.
 *
 * @param {ParquetReadOptions} options read options
 * @param {AsyncBuffer} options.file file-like object containing parquet data
 * @param {FileMetaData} [options.metadata] parquet file metadata
 * @param {string[]} [options.columns] columns to read, all columns if undefined
 * @param {number} [options.rowStart] first requested row index (inclusive)
 * @param {number} [options.rowEnd] last requested row index (exclusive)
 * @param {Compressors} [options.compressors] custom decompressors
 * @returns {Promise<Array<Record<string, any>>>} resolves to the requested rows, one object per row
 */
export function parquetReadObjects(options: ParquetReadOptions): Promise<Array<Record<string, any>>>

/**
 * Wraps parquetRead with orderBy support.
 * This is a parquet-aware query engine that can read a subset of rows and columns.
 * Accepts an optional orderBy column name to sort the results.
 * Note that using orderBy may SIGNIFICANTLY increase the query time.
 *
 * @param {ParquetReadOptions & { orderBy?: string }} options read options, plus an optional orderBy column name
 * @returns {Promise<Record<string, any>[]>} resolves to the requested rows, one object per row, once all requested rows and columns are parsed
 */
export function parquetQuery(options: ParquetReadOptions & { orderBy?: string }): Promise<Array<Record<string, any>>>

/**
 * Read parquet metadata from an async buffer.
 *
 * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
 * asynchronously, possibly over the network.
 *
 * You must provide the byteLength of the buffer, typically from a HEAD request.
 *
 * In theory, you could use suffix-range requests to fetch the end of the file,
 * and save a round trip. But in practice, this doesn't work because Chrome
 * deems suffix-range requests as a not-safe-listed header, and will require
 * a pre-flight. So the byteLength is required.
 *
 * To make this efficient, we initially request the last 512kb of the file,
 * which is likely to contain the metadata. If the metadata length exceeds the
 * initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer.
 *
 * This ensures that we make at most two requests: the initial 512kb request,
 * and, only if the metadata did not fit in it, a second request for up to the
 * metadata size.
 *
 * @param {AsyncBuffer} asyncBuffer parquet file contents
 * @param {number} [initialFetchSize] initial fetch size in bytes (default 512kb)
 * @returns {Promise<FileMetaData>} parquet metadata object
 */
export function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetchSize?: number): Promise<FileMetaData>

/**
 * Read parquet metadata from an in-memory buffer.
 *
 * Unlike parquetMetadataAsync, the metadata is returned directly rather than
 * through a promise.
 *
 * @param {ArrayBuffer} arrayBuffer parquet file contents
 * @returns {FileMetaData} parquet metadata object
 */
export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData

/**
 * Return a tree of schema elements from parquet metadata.
 *
 * @param {FileMetaData} metadata parquet metadata object, e.g. as returned by parquetMetadata or parquetMetadataAsync
 * @returns {SchemaTree} tree of schema elements
 */
export function parquetSchema(metadata: FileMetaData): SchemaTree

/**
 * Decompress snappy data.
 * Accepts an output buffer to avoid allocating a new buffer for each call;
 * the decompressed bytes are written into that buffer rather than returned.
 *
 * NOTE(review): presumably output must already be large enough to hold the
 * decompressed data — confirm against the implementation.
 *
 * @param {Uint8Array} input compressed data
 * @param {Uint8Array} output output buffer that receives the decompressed data
 * @returns {boolean} true if successful
 */
export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean

/**
 * Replace bigints with numbers.
 * When parsing parquet files, bigints are used to represent 64-bit integers.
 * However, JSON does not support bigints, so it's helpful to convert to numbers.
 *
 * NOTE: a JavaScript number cannot exactly represent every 64-bit integer, so
 * values beyond Number.MAX_SAFE_INTEGER may lose precision when converted.
 *
 * @param {any} obj object to convert
 * @returns {any} converted object
 */
export function toJson(obj: any): any

/**
 * Construct an AsyncBuffer for a URL.
 * If byteLength is not provided, will make a HEAD request to get the file size.
 *
 * @param {string} url URL of the file to read
 * @param {number} [byteLength] size of the file in bytes, if already known (skips the HEAD request)
 * @returns {Promise<AsyncBuffer>} resolves to an AsyncBuffer for the URL
 */
export function asyncBufferFromUrl(url: string, byteLength?: number): Promise<AsyncBuffer>

/**
 * Construct an AsyncBuffer for a local file using node fs package.
 *
 * @param {string} filename path of the local file to read
 * @returns {Promise<AsyncBuffer>} resolves to an AsyncBuffer for the file
 */
export function asyncBufferFromFile(filename: string): Promise<AsyncBuffer>

/**
 * Get the byte length of a URL using a HEAD request.
 *
 * @param {string} url URL to query
 * @returns {Promise<number>} resolves to the byte length of the resource at the URL
 */
export function byteLengthFromUrl(url: string): Promise<number>

/**
 * Returns a cached layer on top of an AsyncBuffer.
 *
 * @param {AsyncBuffer} asyncBuffer underlying buffer to wrap
 * @returns {AsyncBuffer} an AsyncBuffer that adds caching on top of the given one
 */
export function cachedAsyncBuffer(asyncBuffer: AsyncBuffer): AsyncBuffer

/**
 * Parquet query options for reading data.
 * Only file is required; every other option is optional.
 */
export interface ParquetReadOptions {
  file: AsyncBuffer // file-like object containing parquet data
  metadata?: FileMetaData // parquet metadata, will be parsed if not provided
  columns?: string[] // columns to read, all columns if undefined
  rowFormat?: string // format of each row passed to the onComplete function (e.g. 'object')
  rowStart?: number // first requested row index (inclusive)
  rowEnd?: number // last requested row index (exclusive)
  onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may be outside the requested range.
  // NOTE(review): with rowFormat 'object' the rows passed to onComplete are
  // reportedly Record<string, any>[] rather than any[][]; the any[][] type was
  // kept to avoid breaking downstream projects, so it does not match that case.
  onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed
  compressors?: Compressors // custom decompressors
  utf8?: boolean // decode byte arrays as utf8 strings (default true)
}

/**
 * A run of column data, as passed to the onChunk callback of ParquetReadOptions.
 */
export interface ColumnData {
  columnName: string // name of the column this data belongs to
  columnData: ArrayLike<any> // values for the column in this run
  rowStart: number // first row index covered by this run — presumably inclusive, TODO confirm
  rowEnd: number // last row index covered by this run — presumably exclusive, TODO confirm
}
Export additional types Signed-off-by: Matthew Peveler <mpeveler@timescale.com> 2024-10-04 21:00:33 +00:00			`import type { AsyncBuffer, CompressionCodec, Compressors, ConvertedType, FileMetaData, LogicalType, ParquetType, SchemaTree } from './types.d.ts'`
Fix typescript definitions 2024-04-26 19:52:42 +00:00
Export additional types Signed-off-by: Matthew Peveler <mpeveler@timescale.com> 2024-10-04 21:00:33 +00:00			`export type { AsyncBuffer, CompressionCodec, Compressors, ConvertedType, FileMetaData, LogicalType, ParquetType, SchemaTree }`
Parquet data page parser 2024-01-07 23:33:24 +00:00
All javascript, no typescript 2024-01-04 19:11:00 +00:00			`/**`
Async parquetRead with options 2024-01-15 19:08:48 +00:00			`* Read parquet data rows from a file-like object.`
			`* Reads the minimal number of row groups and columns to satisfy the request.`
All javascript, no typescript 2024-01-04 19:11:00 +00:00			`*`
Async parquetRead with options 2024-01-15 19:08:48 +00:00			`* Returns a void promise when complete, and to throw errors.`
			`* Data is returned in onComplete, not the return promise, because`
			`* if onComplete is undefined, we parse the data, and emit chunks, but skip`
			`* computing the row view directly. This saves on allocation if the caller`
			`* wants to cache the full chunks, and make their own view of the data from`
			`* the chunks.`
			`*`
			`* @param {object} options read options`
			`* @param {AsyncBuffer} options.file file-like object containing parquet data`
			`* @param {FileMetaData} [options.metadata] parquet file metadata`
Column filter by name 2024-03-14 22:39:00 +00:00			`* @param {string[]} [options.columns] columns to read, all columns if undefined`
Add an option to return each row as an object keyed by column name (#25) * Add an option to return each row as an object keyed by column name * rename option to rowFormat and address feedback 2024-08-13 16:15:59 +00:00			`* @param {string} [options.rowFormat] desired format of each row passed to the onComplete function`
Async parquetRead with options 2024-01-15 19:08:48 +00:00			`* @param {number} [options.rowStart] first requested row index (inclusive)`
			`* @param {number} [options.rowEnd] last requested row index (exclusive)`
decompressPage for dictionary and data page v1 only 2024-02-24 19:55:04 +00:00			`* @param {Function} [options.onChunk] called when a column chunk is parsed. chunks may include row data outside the requested range.`
			`* @param {Function} [options.onComplete] called when all requested rows and columns are parsed`
Custom decompressors 2024-02-23 18:25:06 +00:00			`* @param {Compressors} [options.compressor] custom decompressors`
Async parquetRead with options 2024-01-15 19:08:48 +00:00			`* @returns {Promise<void>} resolves when all requested rows and columns are parsed`
All javascript, no typescript 2024-01-04 19:11:00 +00:00			`*/`
Fix typescript definitions 2024-04-26 19:52:42 +00:00			`export function parquetRead(options: ParquetReadOptions): Promise<void>`
All javascript, no typescript 2024-01-04 19:11:00 +00:00
Promisified parquetReadObjects function 2024-08-20 18:30:39 +00:00			`/**`
			`* Read parquet data and return a Promise of object-oriented row data.`
			`*`
			`* @param {object} options read options`
			`* @param {AsyncBuffer} options.file file-like object containing parquet data`
			`* @param {FileMetaData} [options.metadata] parquet file metadata`
			`* @param {string[]} [options.columns] columns to read, all columns if undefined`
			`* @param {number} [options.rowStart] first requested row index (inclusive)`
			`* @param {number} [options.rowEnd] last requested row index (exclusive)`
			`* @param {Compressors} [options.compressor] custom decompressors`
			`* @returns {Promise<void>} resolves when all requested rows and columns are parsed`
			`*/`
			`export function parquetReadObjects(options: ParquetReadOptions): Promise<Array<Record<string, any>>>`

Query api 2024-09-15 04:12:30 +00:00			`/**`
			`* Wraps parquetRead with orderBy support.`
			`* This is a parquet-aware query engine that can read a subset of rows and columns.`
			`* Accepts an optional orderBy column name to sort the results.`
			`* Note that using orderBy may SIGNIFICANTLY increase the query time.`
			`*`
			`* @param {ParquetReadOptions & { orderBy?: string }} options`
			`* @returns {Promise<Record<string, any>[]>} resolves when all requested rows and columns are parsed`
			`*/`
			`export function parquetQuery(options: ParquetReadOptions & { orderBy?: string }): Promise<Array<Record<string, any>>>`

Async metadata fetching 2024-01-15 19:10:26 +00:00			`/**`
			`* Read parquet metadata from an async buffer.`
			`*`
			`* An AsyncBuffer is like an ArrayBuffer, but the slices are loaded`
			`* asynchronously, possibly over the network.`
			`*`
Check for magic number before reading metadata length. Also make sure that metadata length is available. 2024-02-02 08:06:37 +00:00			`* You must provide the byteLength of the buffer, typically from a HEAD request.`
			`*`
			`* In theory, you could use suffix-range requests to fetch the end of the file,`
			`* and save a round trip. But in practice, this doesn't work because chrome`
			`* deems suffix-range requests as a not-safe-listed header, and will require`
			`* a pre-flight. So the byteLength is required.`
			`*`
Async metadata fetching 2024-01-15 19:10:26 +00:00			`* To make this efficient, we initially request the last 512kb of the file,`
			`* which is likely to contain the metadata. If the metadata length exceeds the`
			`* initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer.`
			`*`
			`* This ensures that we either make one 512kb initial request for the metadata,`
Check for magic number before reading metadata length. Also make sure that metadata length is available. 2024-02-02 08:06:37 +00:00			`* or a second request for up to the metadata size.`
Async metadata fetching 2024-01-15 19:10:26 +00:00			`*`
			`* @param {AsyncBuffer} asyncBuffer parquet file contents`
			`* @param {number} initialFetchSize initial fetch size in bytes (default 512kb)`
Export parquetSchema tree 2024-01-20 02:51:16 +00:00			`* @returns {Promise<FileMetaData>} parquet metadata object`
Async metadata fetching 2024-01-15 19:10:26 +00:00			`*/`
Fix typescript definitions 2024-04-26 19:52:42 +00:00			`export function parquetMetadataAsync(asyncBuffer: AsyncBuffer, initialFetchSize?: number): Promise<FileMetaData>`
All javascript, no typescript 2024-01-04 19:11:00 +00:00
			`/**`
Async metadata fetching 2024-01-15 19:10:26 +00:00			`* Read parquet metadata from a buffer`
All javascript, no typescript 2024-01-04 19:11:00 +00:00			`*`
			`* @param {ArrayBuffer} arrayBuffer parquet file contents`
Export parquetSchema tree 2024-01-20 02:51:16 +00:00			`* @returns {FileMetaData} parquet metadata object`
All javascript, no typescript 2024-01-04 19:11:00 +00:00			`*/`
Parquet schema utils 2024-01-07 04:27:18 +00:00			`export function parquetMetadata(arrayBuffer: ArrayBuffer): FileMetaData`
Export snappyUncompress 2024-01-04 19:45:37 +00:00
Export parquetSchema tree 2024-01-20 02:51:16 +00:00			`/**`
			`* Return a tree of schema elements from parquet metadata.`
			`*`
			`* @param {FileMetaData} metadata parquet metadata object`
			`* @returns {SchemaTree} tree of schema elements`
			`*/`
Fix parquetSchema type 2024-01-20 17:44:13 +00:00			`export function parquetSchema(metadata: FileMetaData): SchemaTree`
Export parquetSchema tree 2024-01-20 02:51:16 +00:00
Export snappyUncompress 2024-01-04 19:45:37 +00:00			`/**`
			`* Decompress snappy data.`
			`* Accepts an output buffer to avoid allocating a new buffer for each call.`
			`*`
decompressPage for dictionary and data page v1 only 2024-02-24 19:55:04 +00:00			`* @param {Uint8Array} input compressed data`
			`* @param {Uint8Array} output output buffer`
Export snappyUncompress 2024-01-04 19:45:37 +00:00			`* @returns {boolean} true if successful`
			`*/`
decompressPage for dictionary and data page v1 only 2024-02-24 19:55:04 +00:00			`export function snappyUncompress(input: Uint8Array, output: Uint8Array): boolean`
Export toJson 2024-01-05 09:39:59 +00:00
			`/**`
			`* Replace bigints with numbers.`
			`* When parsing parquet files, bigints are used to represent 64-bit integers.`
			`* However, JSON does not support bigints, so it's helpful to convert to numbers.`
			`*`
			`* @param {any} obj object to convert`
			`* @returns {unknown} converted object`
			`*/`
Demo: move to folder, typecheck, and render column indices 2024-06-01 02:40:44 +00:00			`export function toJson(obj: any): any`
Async parquetRead with options 2024-01-15 19:08:48 +00:00
Export asyncBufferFromFile, asyncBufferFromUrl and add to README 2024-07-27 00:02:45 +00:00			`/**`
			`* Construct an AsyncBuffer for a URL.`
Fix util export types 2024-09-26 17:34:26 +00:00			`* If byteLength is not provided, will make a HEAD request to get the file size.`
Export asyncBufferFromFile, asyncBufferFromUrl and add to README 2024-07-27 00:02:45 +00:00			`*/`
Fix util export types 2024-09-26 17:34:26 +00:00			`export function asyncBufferFromUrl(url: string, byteLength?: number): Promise<AsyncBuffer>`
Export asyncBufferFromFile, asyncBufferFromUrl and add to README 2024-07-27 00:02:45 +00:00
			`/**`
			`* Construct an AsyncBuffer for a local file using node fs package.`
			`*/`
			`export function asyncBufferFromFile(filename: string): Promise<AsyncBuffer>`

Fix util export types 2024-09-26 17:34:26 +00:00			`/**`
			`* Get the byte length of a URL using a HEAD request.`
			`*/`
			`export function byteLengthFromUrl(url: string): Promise<number>`

Export cachedAsyncBuffer 2024-10-16 08:09:18 +00:00			`/**`
			`* Returns a cached layer on top of an AsyncBuffer.`
			`*/`
			`export function cachedAsyncBuffer(asyncBuffer: AsyncBuffer): AsyncBuffer`

Async parquetRead with options 2024-01-15 19:08:48 +00:00			`/**`
			`* Parquet query options for reading data`
			`*/`
			`export interface ParquetReadOptions {`
			`file: AsyncBuffer // file-like object containing parquet data`
			`metadata?: FileMetaData // parquet metadata, will be parsed if not provided`
Column filter by name 2024-03-14 22:39:00 +00:00			`columns?: string[] // columns to read, all columns if undefined`
Add an option to return each row as an object keyed by column name (#25) * Add an option to return each row as an object keyed by column name * rename option to rowFormat and address feedback 2024-08-13 16:15:59 +00:00			`rowFormat?: string // format of each row passed to the onComplete function`
Async parquetRead with options 2024-01-15 19:08:48 +00:00			`rowStart?: number // inclusive`
			`rowEnd?: number // exclusive`
			`onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may be outside the requested range.`
Revert onComplete type signature change from #25 The type change caused a lot of downstream type errors. If you pass rowFormat: 'object' then it will return Record<string, any>[] instead of any[][]. This means the types are not aligned with behavior. Will figure out how to fix it later, for now don't want break downstream projects. 2024-08-15 04:51:33 +00:00			`onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed`
Custom decompressors 2024-02-23 18:25:06 +00:00			`compressors?: Compressors // custom decompressors`
Convert byte arrays to utf8 by default 2024-05-23 05:24:54 +00:00			`utf8?: boolean // decode byte arrays as utf8 strings (default true)`
Async parquetRead with options 2024-01-15 19:08:48 +00:00			`}`

			`/**`
			`* A run of column data`
			`*/`
			`export interface ColumnData {`
Column filter by name 2024-03-14 22:39:00 +00:00			`columnName: string`
			`columnData: ArrayLike<any>`
Async parquetRead with options 2024-01-15 19:08:48 +00:00			`rowStart: number`
			`rowEnd: number`
			`}`