// hyparquet/src/types.d.ts
// (304 lines, 5.9 KiB, TypeScript)

/**
 * A value of type `T` that is either available directly or wrapped in a
 * `Promise<T>`.
 */
export type Awaitable<T> = T | Promise<T>
/**
 * File-like object that can read slices of a file asynchronously.
 */
export interface AsyncBuffer {
  /** Length in bytes, as the name indicates. */
  byteLength: number
  /**
   * Reads a slice beginning at `start`; `end` may be omitted.
   * The result may be returned directly or through a promise.
   * NOTE(review): presumably follows `ArrayBuffer.prototype.slice` semantics
   * (exclusive `end`, defaulting to the end of the data) — confirm with implementers.
   */
  slice(start: number, end?: number): Awaitable<ArrayBuffer>
}
/**
 * Pairs a `DataView` with a numeric `offset`.
 * NOTE(review): presumably `offset` is a byte cursor into `view` that decoders
 * advance as they read — confirm against callers.
 */
export interface DataReader {
  offset: number
  view: DataView
}
// Parquet file metadata types

/**
 * Top-level metadata of a parquet file: format version, schema elements,
 * row count, row groups, and optional key/value and creator information.
 * NOTE(review): field names appear to mirror the Parquet Thrift `FileMetaData`
 * struct — confirm against parquet.thrift.
 */
export interface FileMetaData {
  version: number
  schema: SchemaElement[]
  num_rows: bigint
  row_groups: RowGroup[]
  key_value_metadata?: KeyValue[]
  created_by?: string
  // column_orders?: ColumnOrder[]
  // encryption_algorithm?: EncryptionAlgorithm
  // footer_signing_key_metadata?: Uint8Array
  /** NOTE(review): presumably the byte length of the serialized metadata — confirm where it is populated. */
  metadata_length: number
}
/**
 * Recursive tree node wrapping a `SchemaElement` together with its child
 * nodes and its `path` as a list of strings.
 */
export interface SchemaTree {
  children: SchemaTree[]
  /** NOTE(review): presumably the number of schema elements covered by this subtree — confirm against the builder. */
  count: number
  element: SchemaElement
  path: string[]
}
/**
 * One element of a parquet schema. Only `name` is required; every other
 * attribute is optional.
 * NOTE(review): field names appear to mirror the Parquet Thrift `SchemaElement`
 * struct — confirm against parquet.thrift.
 */
export interface SchemaElement {
  type?: ParquetType
  type_length?: number
  repetition_type?: FieldRepetitionType
  name: string
  num_children?: number
  converted_type?: ConvertedType
  scale?: number
  precision?: number
  field_id?: number
  logical_type?: LogicalType
}
/**
 * Names of the physical (primitive) parquet types.
 */
export type ParquetType =
  | 'BOOLEAN'
  | 'INT32'
  | 'INT64'
  | 'INT96' // deprecated
  | 'FLOAT'
  | 'DOUBLE'
  | 'BYTE_ARRAY'
  | 'FIXED_LEN_BYTE_ARRAY'
/**
 * Repetition of a schema field: required, optional, or repeated.
 */
export type FieldRepetitionType =
  | 'REQUIRED'
  | 'OPTIONAL'
  | 'REPEATED'
/**
 * Names accepted for `SchemaElement.converted_type`.
 */
export type ConvertedType =
  | 'UTF8'
  | 'MAP'
  | 'MAP_KEY_VALUE'
  | 'LIST'
  | 'ENUM'
  | 'DECIMAL'
  | 'DATE'
  | 'TIME_MILLIS'
  | 'TIME_MICROS'
  | 'TIMESTAMP_MILLIS'
  | 'TIMESTAMP_MICROS'
  | 'UINT_8'
  | 'UINT_16'
  | 'UINT_32'
  | 'UINT_64'
  | 'INT_8'
  | 'INT_16'
  | 'INT_32'
  | 'INT_64'
  | 'JSON'
  | 'BSON'
  | 'INTERVAL'
/**
 * Logical type variant tagged `'DECIMAL'`, carrying numeric `precision` and `scale`.
 */
type LogicalDecimalType = {
  type: 'DECIMAL'
  precision: number
  scale: number
}
/**
 * Unit names used by the time and timestamp logical types.
 */
export type TimeUnit = 'MILLIS' | 'MICROS' | 'NANOS'
/**
 * Logical type variant tagged `'TIME'`, with a UTC-adjustment flag and a unit.
 */
type LogicalTimeType = {
  type: 'TIME'
  isAdjustedToUTC: boolean
  unit: TimeUnit
}
/**
 * Logical type variant tagged `'TIMESTAMP'`, with a UTC-adjustment flag and a unit.
 */
type LogicalTimestampType = {
  type: 'TIMESTAMP'
  unit: TimeUnit
  isAdjustedToUTC: boolean
}
/**
 * Logical type variant tagged `'INTEGER'`, with a bit width and a signedness flag.
 */
type LogicalIntType = {
  type: 'INTEGER'
  bitWidth: number
  isSigned: boolean
}
/**
 * Logical type of a schema element: either an object carrying only a simple
 * `type` tag, or one of the parameterized variants (decimal, time, timestamp,
 * integer).
 */
export type LogicalType =
  | { type: LogicalTypeSimple }
  | LogicalDecimalType
  | LogicalTimeType
  | LogicalTimestampType
  | LogicalIntType
/**
 * Logical type tags that `LogicalType` accepts as a bare `{ type }` object.
 * NOTE(review): `'DECIMAL'` is listed here and is also the tag of
 * `LogicalDecimalType`, so `{ type: 'DECIMAL' }` without precision/scale
 * type-checks as a `LogicalType` — confirm whether that is intended.
 */
type LogicalTypeSimple =
  | 'STRING'
  | 'MAP'
  | 'LIST'
  | 'ENUM'
  | 'DECIMAL'
  | 'DATE'
  | 'INTERVAL'
  | 'NULL'
  | 'JSON'
  | 'BSON'
  | 'UUID'
  | 'FLOAT16'
/**
 * Every logical type tag: the simple tags plus the tags of the parameterized
 * time, timestamp and integer variants.
 */
export type LogicalTypeType =
  | LogicalTypeSimple
  | 'TIME' // convertedType TIME_MILLIS or TIME_MICROS
  | 'TIMESTAMP' // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS
  | 'INTEGER' // convertedType INT or UINT
/**
 * Metadata for one row group: its column chunks, total byte size and row
 * count, plus optional sorting and placement information.
 * NOTE(review): field names appear to mirror the Parquet Thrift `RowGroup`
 * struct — confirm against parquet.thrift.
 */
export interface RowGroup {
  columns: ColumnChunk[]
  total_byte_size: bigint
  num_rows: bigint
  sorting_columns?: SortingColumn[]
  file_offset?: bigint
  total_compressed_size?: bigint
  ordinal?: number
}
/**
 * Metadata for one column chunk. Only `file_offset` is required; the column
 * metadata, index locations and encryption fields are optional.
 * NOTE(review): field names appear to mirror the Parquet Thrift `ColumnChunk`
 * struct — confirm against parquet.thrift.
 */
export interface ColumnChunk {
  file_path?: string
  file_offset: bigint
  meta_data?: ColumnMetaData
  offset_index_offset?: bigint
  offset_index_length?: number
  column_index_offset?: bigint
  column_index_length?: number
  crypto_metadata?: ColumnCryptoMetaData
  encrypted_column_metadata?: Uint8Array
}
/**
 * Detailed metadata for a column chunk: physical type, encodings, schema
 * path, compression codec, value count, sizes, page offsets, and optional
 * statistics.
 * NOTE(review): field names appear to mirror the Parquet Thrift
 * `ColumnMetaData` struct — confirm against parquet.thrift.
 */
export interface ColumnMetaData {
  type: ParquetType
  encodings: Encoding[]
  path_in_schema: string[]
  codec: CompressionCodec
  num_values: bigint
  total_uncompressed_size: bigint
  total_compressed_size: bigint
  key_value_metadata?: KeyValue[]
  data_page_offset: bigint
  index_page_offset?: bigint
  dictionary_page_offset?: bigint
  statistics?: Statistics
  encoding_stats?: PageEncodingStats[]
  bloom_filter_offset?: bigint
  bloom_filter_length?: number
  size_statistics?: SizeStatistics
}
/**
 * Type of `ColumnChunk.crypto_metadata`; declared with no members.
 */
interface ColumnCryptoMetaData {
}
/**
 * Names of the parquet value encodings.
 */
export type Encoding =
  | 'PLAIN'
  | 'PLAIN_DICTIONARY'
  | 'RLE'
  | 'BIT_PACKED' // deprecated
  | 'DELTA_BINARY_PACKED'
  | 'DELTA_LENGTH_BYTE_ARRAY'
  | 'DELTA_BYTE_ARRAY'
  | 'RLE_DICTIONARY'
  | 'BYTE_STREAM_SPLIT'
/**
 * Names of the compression codecs a column chunk may declare.
 */
export type CompressionCodec =
  | 'UNCOMPRESSED'
  | 'SNAPPY'
  | 'GZIP'
  | 'LZO'
  | 'BROTLI'
  | 'LZ4'
  | 'ZSTD'
  | 'LZ4_RAW'
/**
 * Optional per-codec functions, keyed by `CompressionCodec` name. Each
 * function takes the input bytes and an `outputLength` and returns a
 * `Uint8Array`.
 * NOTE(review): presumably these are decompressors and `outputLength` is the
 * expected uncompressed size — confirm against callers.
 */
export type Compressors = {
  [K in CompressionCodec]?: (input: Uint8Array, outputLength: number) => Uint8Array
}
/**
 * A string key with an optional string value.
 */
interface KeyValue {
  key: string
  value?: string
}
/**
 * Value types allowed for `Statistics.min` and `Statistics.max`.
 */
type MinMaxType =
  | string
  | number
  | boolean
  | bigint
/**
 * Optional statistics: min/max in two forms, null and distinct counts, and
 * flags marking whether the min/max values are exact.
 * NOTE(review): field names appear to mirror the Parquet Thrift `Statistics`
 * struct — confirm against parquet.thrift.
 */
export interface Statistics {
  max?: MinMaxType
  min?: MinMaxType
  null_count?: bigint
  distinct_count?: bigint
  max_value?: string
  min_value?: string
  is_max_value_exact?: boolean
  is_min_value_exact?: boolean
}
/**
 * Optional size statistics: an unencoded byte-array byte count and
 * repetition/definition level histograms, all expressed as bigints.
 */
interface SizeStatistics {
  unencoded_byte_array_data_bytes?: bigint
  repetition_level_histogram?: bigint[]
  definition_level_histogram?: bigint[]
}
/**
 * A count associated with one page type / encoding combination.
 */
interface PageEncodingStats {
  page_type: PageType
  encoding: Encoding
  count: number
}
/**
 * Names of the parquet page kinds.
 */
export type PageType =
  | 'DATA_PAGE'
  | 'INDEX_PAGE'
  | 'DICTIONARY_PAGE'
  | 'DATA_PAGE_V2'
/**
 * Sort description for one column: its index, sort direction, and whether
 * nulls come first.
 */
interface SortingColumn {
  column_idx: number
  descending: boolean
  nulls_first: boolean
}
// Parquet page header types

/**
 * Header of a page: its type, uncompressed and compressed sizes, an optional
 * CRC, and optional type-specific sub-headers.
 * NOTE(review): presumably only the sub-header matching `type` is set —
 * confirm against the header parser.
 */
export interface PageHeader {
  type: PageType
  uncompressed_page_size: number
  compressed_page_size: number
  crc?: number
  data_page_header?: DataPageHeader
  index_page_header?: IndexPageHeader
  dictionary_page_header?: DictionaryPageHeader
  data_page_header_v2?: DataPageHeaderV2
}
/**
 * Sub-header for a data page: value count, the encodings used for values and
 * for definition/repetition levels, and optional statistics.
 */
export interface DataPageHeader {
  num_values: number
  encoding: Encoding
  definition_level_encoding: Encoding
  repetition_level_encoding: Encoding
  statistics?: Statistics
}
/**
 * Type of `PageHeader.index_page_header`; declared with no members.
 */
interface IndexPageHeader {
}
/**
 * Sub-header for a dictionary page: value count, encoding, and an optional
 * sortedness flag.
 */
export interface DictionaryPageHeader {
  num_values: number
  encoding: Encoding
  is_sorted?: boolean
}
/**
 * Sub-header for a v2 data page: value, null and row counts, the value
 * encoding, byte lengths of the definition and repetition levels, an optional
 * compression flag, and optional statistics.
 */
interface DataPageHeaderV2 {
  num_values: number
  num_nulls: number
  num_rows: number
  encoding: Encoding
  definition_levels_byte_length: number
  repetition_levels_byte_length: number
  is_compressed?: boolean
  statistics?: Statistics
}
/**
 * A data page after decoding: definition levels (possibly `undefined`),
 * repetition levels, and the decoded values.
 */
interface DataPage {
  definitionLevels: number[] | undefined
  repetitionLevels: number[]
  dataPage: DecodedArray
}
/**
 * Container types a decoded value array may take: one of several typed
 * arrays, or a plain array of arbitrary values.
 */
export type DecodedArray =
  | Uint8Array
  | Int32Array
  | BigInt64Array
  | BigUint64Array
  | Float32Array
  | Float64Array
  | any[]