// Mirror of https://github.com/asadbek064/hyparquet.git
// Synced 2025-12-26 15:16:38 +00:00
// 304 lines, 5.9 KiB, TypeScript

/**
 * A value of type `T` that is either available directly or wrapped in a
 * Promise.
 */
export type Awaitable<T> = T | Promise<T>

/**
 * File-like object that can read slices of a file asynchronously.
 */
export interface AsyncBuffer {
  /** Length in bytes. NOTE(review): presumably the total file size; confirm. */
  byteLength: number
  /**
   * Read a range beginning at `start`, with an optional `end`. The result may
   * be returned directly or as a Promise.
   * NOTE(review): `end` is presumably exclusive, as in `ArrayBuffer.slice`; confirm.
   */
  slice(start: number, end?: number): Awaitable<ArrayBuffer>
}

/**
 * A `DataView` paired with a numeric offset.
 * NOTE(review): `offset` is presumably the current read position within
 * `view`, advanced by the decoders; confirm against callers.
 */
export interface DataReader {
  view: DataView
  offset: number
}

// Parquet file metadata types

/**
 * Top-level metadata of a Parquet file: a version number, the schema
 * elements, the row count, the row groups, and optional key/value pairs and
 * writer string.
 */
export interface FileMetaData {
  version: number
  schema: SchemaElement[]
  num_rows: bigint
  row_groups: RowGroup[]
  key_value_metadata?: KeyValue[]
  created_by?: string
  // The following fields are not modelled yet:
  // column_orders?: ColumnOrder[]
  // encryption_algorithm?: EncryptionAlgorithm
  // footer_signing_key_metadata?: Uint8Array
  /** NOTE(review): presumably the byte length of the serialized metadata; confirm. */
  metadata_length: number
}

/**
 * Recursive tree node wrapping a schema element together with its child
 * nodes and a path of names.
 * NOTE(review): `count` is presumably the number of schema elements covered
 * by this subtree; confirm against the code that builds the tree.
 */
export interface SchemaTree {
  children: SchemaTree[]
  count: number
  element: SchemaElement
  path: string[]
}

/**
 * One element of the file schema. Only `name` is required; every other
 * attribute (physical type, repetition, child count, converted/logical type
 * annotations, decimal scale/precision, field id) is optional.
 */
export interface SchemaElement {
  type?: ParquetType
  type_length?: number
  repetition_type?: FieldRepetitionType
  name: string
  num_children?: number
  converted_type?: ConvertedType
  scale?: number
  precision?: number
  field_id?: number
  logical_type?: LogicalType
}

/** Names of the physical value types a schema element or column may declare. */
export type ParquetType =
  'BOOLEAN' |
  'INT32' |
  'INT64' |
  'INT96' | // deprecated
  'FLOAT' |
  'DOUBLE' |
  'BYTE_ARRAY' |
  'FIXED_LEN_BYTE_ARRAY'

/** Repetition kinds a schema element may declare. */
export type FieldRepetitionType =
  'REQUIRED' |
  'OPTIONAL' |
  'REPEATED'

/** Names of the converted-type annotations a schema element may carry. */
export type ConvertedType =
  'UTF8' |
  'MAP' |
  'MAP_KEY_VALUE' |
  'LIST' |
  'ENUM' |
  'DECIMAL' |
  'DATE' |
  'TIME_MILLIS' |
  'TIME_MICROS' |
  'TIMESTAMP_MILLIS' |
  'TIMESTAMP_MICROS' |
  'UINT_8' |
  'UINT_16' |
  'UINT_32' |
  'UINT_64' |
  'INT_8' |
  'INT_16' |
  'INT_32' |
  'INT_64' |
  'JSON' |
  'BSON' |
  'INTERVAL'

/** Logical type variant tagged `'DECIMAL'`, carrying precision and scale. */
type LogicalDecimalType = {
  type: 'DECIMAL'
  precision: number
  scale: number
}

/** Unit names used by the TIME and TIMESTAMP logical type variants. */
export type TimeUnit = 'MILLIS' | 'MICROS' | 'NANOS'

/** Logical type variant tagged `'TIME'`, with a UTC-adjustment flag and a unit. */
type LogicalTimeType = {
  type: 'TIME'
  isAdjustedToUTC: boolean
  unit: TimeUnit
}

/** Logical type variant tagged `'TIMESTAMP'`, with a UTC-adjustment flag and a unit. */
type LogicalTimestampType = {
  type: 'TIMESTAMP'
  isAdjustedToUTC: boolean
  unit: TimeUnit
}

/** Logical type variant tagged `'INTEGER'`, with a bit width and signedness flag. */
type LogicalIntType = {
  type: 'INTEGER'
  bitWidth: number
  isSigned: boolean
}

/**
 * Logical type annotation, discriminated by its `type` tag: either a bare tag
 * drawn from `LogicalTypeSimple`, or one of the parameterized variants
 * (decimal, time, timestamp, integer).
 */
export type LogicalType =
  { type: LogicalTypeSimple } |
  LogicalDecimalType |
  LogicalTimeType |
  LogicalTimestampType |
  LogicalIntType

/**
 * Tags of logical types that can appear as a bare `{ type }` object.
 * Note: `'DECIMAL'` is listed here as well as being the tag of
 * `LogicalDecimalType`, so a `LogicalType` of `{ type: 'DECIMAL' }` without
 * precision/scale also type-checks.
 */
type LogicalTypeSimple =
  'STRING' |
  'MAP' |
  'LIST' |
  'ENUM' |
  'DECIMAL' |
  'DATE' |
  'INTERVAL' |
  'NULL' |
  'JSON' |
  'BSON' |
  'UUID' |
  'FLOAT16'

/**
 * Every logical type tag: the simple tags plus the tags of the parameterized
 * TIME, TIMESTAMP and INTEGER variants.
 */
export type LogicalTypeType = LogicalTypeSimple |
  'TIME' | // convertedType TIME_MILLIS or TIME_MICROS
  'TIMESTAMP' | // convertedType TIMESTAMP_MILLIS or TIMESTAMP_MICROS
  'INTEGER' // convertedType INT or UINT

/**
 * Metadata for one row group: its column chunks, total byte size and row
 * count, plus optional sorting columns, file offset, compressed size and
 * ordinal.
 */
export interface RowGroup {
  columns: ColumnChunk[]
  total_byte_size: bigint
  num_rows: bigint
  sorting_columns?: SortingColumn[]
  file_offset?: bigint
  total_compressed_size?: bigint
  ordinal?: number
}

/**
 * Descriptor of one column chunk. Only `file_offset` is required; the column
 * metadata, the offset/column index locations and the encryption-related
 * fields are all optional.
 */
export interface ColumnChunk {
  file_path?: string
  file_offset: bigint
  meta_data?: ColumnMetaData
  offset_index_offset?: bigint
  offset_index_length?: number
  column_index_offset?: bigint
  column_index_length?: number
  crypto_metadata?: ColumnCryptoMetaData
  encrypted_column_metadata?: Uint8Array
}

/**
 * Metadata of a column chunk: physical type, encodings, schema path, codec,
 * value count, sizes and page offsets, plus optional statistics, encoding
 * stats, bloom filter location and size statistics.
 */
export interface ColumnMetaData {
  type: ParquetType
  encodings: Encoding[]
  path_in_schema: string[]
  codec: CompressionCodec
  num_values: bigint
  total_uncompressed_size: bigint
  total_compressed_size: bigint
  key_value_metadata?: KeyValue[]
  data_page_offset: bigint
  index_page_offset?: bigint
  dictionary_page_offset?: bigint
  statistics?: Statistics
  encoding_stats?: PageEncodingStats[]
  bloom_filter_offset?: bigint
  bloom_filter_length?: number
  size_statistics?: SizeStatistics
}

/**
 * Placeholder for column encryption metadata; no fields are declared, so any
 * non-nullish value is assignable to it.
 */
interface ColumnCryptoMetaData {}

/** Names of the value and level encodings. */
export type Encoding =
  'PLAIN' |
  'PLAIN_DICTIONARY' |
  'RLE' |
  'BIT_PACKED' | // deprecated
  'DELTA_BINARY_PACKED' |
  'DELTA_LENGTH_BYTE_ARRAY' |
  'DELTA_BYTE_ARRAY' |
  'RLE_DICTIONARY' |
  'BYTE_STREAM_SPLIT'

/** Names of the compression codecs a column chunk may declare. */
export type CompressionCodec =
  'UNCOMPRESSED' |
  'SNAPPY' |
  'GZIP' |
  'LZO' |
  'BROTLI' |
  'LZ4' |
  'ZSTD' |
  'LZ4_RAW'

/**
 * Optional per-codec functions keyed by codec name. Each takes input bytes
 * and an output length and returns bytes.
 * NOTE(review): presumably these decompress `input` into `outputLength`
 * bytes; confirm against the page reader.
 */
export type Compressors = {
  [K in CompressionCodec]?: (input: Uint8Array, outputLength: number) => Uint8Array
}

/** A string key with an optional string value. */
interface KeyValue {
  key: string
  value?: string
}

/** Value types permitted for the `min` / `max` fields of `Statistics`. */
type MinMaxType = bigint | boolean | number | string

/**
 * Optional statistics: min/max values, null and distinct counts, string
 * `min_value` / `max_value`, and flags marking whether those values are exact.
 */
export interface Statistics {
  max?: MinMaxType
  min?: MinMaxType
  null_count?: bigint
  distinct_count?: bigint
  max_value?: string
  min_value?: string
  is_max_value_exact?: boolean
  is_min_value_exact?: boolean
}

/**
 * Optional size statistics: unencoded byte-array data size and histograms of
 * repetition and definition levels.
 */
interface SizeStatistics {
  unencoded_byte_array_data_bytes?: bigint
  repetition_level_histogram?: bigint[]
  definition_level_histogram?: bigint[]
}

/** A count associated with one page type / encoding combination. */
interface PageEncodingStats {
  page_type: PageType
  encoding: Encoding
  count: number
}

/** Names of the page kinds. */
export type PageType =
  'DATA_PAGE' |
  'INDEX_PAGE' |
  'DICTIONARY_PAGE' |
  'DATA_PAGE_V2'

/** Sort description for one column: its index, direction and null placement. */
interface SortingColumn {
  column_idx: number
  descending: boolean
  nulls_first: boolean
}

// Parquet file header types

/**
 * Header of one page: its type, uncompressed and compressed sizes, an
 * optional CRC, and optional type-specific sub-headers.
 * NOTE(review): presumably only the sub-header matching `type` is populated;
 * confirm against the header parser.
 */
export interface PageHeader {
  type: PageType
  uncompressed_page_size: number
  compressed_page_size: number
  crc?: number
  data_page_header?: DataPageHeader
  index_page_header?: IndexPageHeader
  dictionary_page_header?: DictionaryPageHeader
  data_page_header_v2?: DataPageHeaderV2
}

/**
 * Header for a data page: value count, the encodings of the values and of
 * the definition/repetition levels, and optional statistics.
 */
export interface DataPageHeader {
  num_values: number
  encoding: Encoding
  definition_level_encoding: Encoding
  repetition_level_encoding: Encoding
  statistics?: Statistics
}

/**
 * Placeholder for the index page header; no fields are declared, so any
 * non-nullish value is assignable to it.
 */
interface IndexPageHeader {}

/** Header for a dictionary page: value count, encoding and an optional sorted flag. */
export interface DictionaryPageHeader {
  num_values: number
  encoding: Encoding
  is_sorted?: boolean
}

/**
 * Header for a v2 data page: value, null and row counts, the value encoding,
 * the byte lengths of the definition and repetition levels, and optional
 * compression flag and statistics.
 */
interface DataPageHeaderV2 {
  num_values: number
  num_nulls: number
  num_rows: number
  encoding: Encoding
  definition_levels_byte_length: number
  repetition_levels_byte_length: number
  is_compressed?: boolean
  statistics?: Statistics
}

/**
 * Contents of a data page: definition levels (may be `undefined`),
 * repetition levels, and the page's values.
 */
interface DataPage {
  definitionLevels: number[] | undefined
  repetitionLevels: number[]
  dataPage: DecodedArray
}

/** Containers a decoded page may use: a typed array or a plain array. */
export type DecodedArray =
  Uint8Array |
  Int32Array |
  BigInt64Array |
  BigUint64Array |
  Float32Array |
  Float64Array |
  any[]
