diff --git a/README.md b/README.md
index bfee3c1..5acf45e 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,10 @@ const data = await parquetReadObjects({
 })
 ```
 
+## Parquet Writing
+
+To create parquet files from javascript, check out the [hyparquet-writer](https://github.com/hyparam/hyparquet-writer) package.
+
 ## Advanced Usage
 
 ### Reading Metadata
@@ -180,10 +184,6 @@ await parquetRead({
 
 The `parquetReadObjects` function defaults to returning an array of objects.
 
-## Parquet Writing
-
-To create parquet files from javascript, check out the [hyparquet-writer](https://github.com/hyparam/hyparquet-writer) package.
-
 ## Supported Parquet Files
 
 The parquet format is known to be a sprawling format which includes options for a wide array of compression schemes, encoding types, and data structures.
diff --git a/src/types.d.ts b/src/types.d.ts
index f423fb7..60c8cee 100644
--- a/src/types.d.ts
+++ b/src/types.d.ts
@@ -1,4 +1,48 @@
-export type Awaitable<T> = T | Promise<T>
+/**
+ * Parquet query options for reading data
+ */
+export interface ParquetReadOptions {
+  file: AsyncBuffer // file-like object containing parquet data
+  metadata?: FileMetaData // parquet metadata, will be parsed if not provided
+  columns?: string[] // columns to read, all columns if undefined
+  rowFormat?: string // format of each row passed to the onComplete function
+  rowStart?: number // first requested row index (inclusive)
+  rowEnd?: number // last requested row index (exclusive)
+  onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may contain data outside the requested range.
+  onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed
+  compressors?: Compressors // custom decompressors
+  utf8?: boolean // decode byte arrays as utf8 strings (default true)
+}
+
+/**
+ * Parquet query options for filtering data
+ */
+export interface ParquetQueryFilter {
+  [key: string]: ParquetQueryValue | ParquetQueryOperator | ParquetQueryFilter[] | undefined
+  $and?: ParquetQueryFilter[]
+  $or?: ParquetQueryFilter[]
+  $not?: ParquetQueryFilter
+}
+export type ParquetQueryValue = string | number | boolean | object | null | undefined
+export type ParquetQueryOperator = {
+  $gt?: ParquetQueryValue
+  $gte?: ParquetQueryValue
+  $lt?: ParquetQueryValue
+  $lte?: ParquetQueryValue
+  $ne?: ParquetQueryValue
+  $in?: ParquetQueryValue[]
+  $nin?: ParquetQueryValue[]
+}
+
+/**
+ * A run of column data
+ */
+export interface ColumnData {
+  columnName: string
+  columnData: DecodedArray
+  rowStart: number
+  rowEnd: number
+}
 
 /**
  * File-like object that can read slices of a file asynchronously.
@@ -7,6 +51,7 @@ export interface AsyncBuffer {
   byteLength: number
   slice(start: number, end?: number): Awaitable<ArrayBuffer>
 }
+export type Awaitable<T> = T | Promise<T>
 
 export interface DataReader {
   view: DataView
@@ -329,50 +374,5 @@ export interface ColumnIndex {
 
 export type BoundaryOrder = 'UNORDERED' | 'ASCENDING' | 'DESCENDING'
 
-/**
- * A run of column data
- */
-export interface ColumnData {
-  columnName: string
-  columnData: ArrayLike<any>
-  rowStart: number
-  rowEnd: number
-}
-
-/**
- * Parquet query options for reading data
- */
-export interface ParquetReadOptions {
-  file: AsyncBuffer // file-like object containing parquet data
-  metadata?: FileMetaData // parquet metadata, will be parsed if not provided
-  columns?: string[] // columns to read, all columns if undefined
-  rowFormat?: string // format of each row passed to the onComplete function
-  rowStart?: number // first requested row index (inclusive)
-  rowEnd?: number // last requested row index (exclusive)
-  onChunk?: (chunk: ColumnData) => void // called when a column chunk is parsed. chunks may be outside the requested range.
-  onComplete?: (rows: any[][]) => void // called when all requested rows and columns are parsed
-  compressors?: Compressors // custom decompressors
-  utf8?: boolean // decode byte arrays as utf8 strings (default true)
-}
-
-export type ParquetQueryValue = string | number | boolean | object | null | undefined
-
-export type ParquetQueryOperator = {
-  $gt?: ParquetQueryValue
-  $gte?: ParquetQueryValue
-  $lt?: ParquetQueryValue
-  $lte?: ParquetQueryValue
-  $ne?: ParquetQueryValue
-  $in?: ParquetQueryValue[]
-  $nin?: ParquetQueryValue[]
-}
-
-export interface ParquetQueryFilter {
-  [key: string]: ParquetQueryValue | ParquetQueryOperator | ParquetQueryFilter[] | undefined
-  $and?: ParquetQueryFilter[]
-  $or?: ParquetQueryFilter[]
-  $not?: ParquetQueryFilter
-}
-
 export type ThriftObject = { [ key: `field_${number}` ]: ThriftType }
 export type ThriftType = boolean | number | bigint | Uint8Array | ThriftType[] | ThriftObject
diff --git a/test/column.test.js b/test/column.test.js
index 62967a4..b5bb3cd 100644
--- a/test/column.test.js
+++ b/test/column.test.js
@@ -1,4 +1,3 @@
-import { compressors } from 'hyparquet-compressors'
 import { describe, expect, it } from 'vitest'
 import { getColumnRange, readColumn } from '../src/column.js'
 import { parquetMetadata } from '../src/hyparquet.js'
@@ -14,8 +13,8 @@ describe('readColumn', () => {
     { rowGroupEnd: 0, expected: [] },
   ])('readColumn with rowGroupEnd %p', async ({ rowGroupEnd, expected }) => {
     const testFile = 'test/files/float16_nonzeros_and_nans.parquet'
-    const asyncBuffer = await asyncBufferFromFile(testFile)
-    const arrayBuffer = await asyncBuffer.slice(0)
+    const file = await asyncBufferFromFile(testFile)
+    const arrayBuffer = await file.slice(0)
     const metadata = parquetMetadata(arrayBuffer)
     const column = metadata.row_groups[0].columns[0]
 
@@ -25,14 +24,14 @@ describe('readColumn', () => {
     const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? [])
     const reader = { view: new DataView(columnArrayBuffer), offset: 0 }
 
-    const result = readColumn(reader, 0, rowGroupEnd, column.meta_data, schemaPath, { file: asyncBuffer, compressors })
+    const result = readColumn(reader, 0, rowGroupEnd, column.meta_data, schemaPath, { file })
     expect(result).toEqual(expected)
   })
 
   it('readColumn should return a typed array', async () => {
     const testFile = 'test/files/datapage_v2.snappy.parquet'
-    const asyncBuffer = await asyncBufferFromFile(testFile)
-    const arrayBuffer = await asyncBuffer.slice(0)
+    const file = await asyncBufferFromFile(testFile)
+    const arrayBuffer = await file.slice(0)
     const metadata = parquetMetadata(arrayBuffer)
     const column = metadata.row_groups[0].columns[1] // second column
 
@@ -42,7 +41,7 @@ describe('readColumn', () => {
     const schemaPath = getSchemaPath(metadata.schema, column.meta_data?.path_in_schema ?? [])
     const reader = { view: new DataView(columnArrayBuffer), offset: 0 }
 
-    const columnData = readColumn(reader, 0, Infinity, column.meta_data, schemaPath, { file: asyncBuffer, compressors })
+    const columnData = readColumn(reader, 0, Infinity, column.meta_data, schemaPath, { file })
     expect(columnData[0]).toBeInstanceOf(Int32Array)
   })
 })
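For context, the relocated `ParquetQueryFilter` / `ParquetQueryOperator` types describe MongoDB-style row filters. A minimal usage sketch (not part of this patch; it assumes `parquetQuery` and `asyncBufferFromFile` are exported from the `hyparquet` entry point and that `parquetQuery` accepts a `filter` of type `ParquetQueryFilter`):

```js
// Sketch only: the file path, column names, and query surface are illustrative assumptions.
import { asyncBufferFromFile, parquetQuery } from 'hyparquet'

const file = await asyncBufferFromFile('data.parquet')

// $and / $or / $not combine per-field operators such as $gt, $ne, $in
const rows = await parquetQuery({
  file,
  columns: ['name', 'age'],
  filter: { $or: [{ age: { $gt: 30 } }, { name: { $in: ['Alice', 'Bob'] } }] },
})
console.log(rows)
```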