hyparquet/src/types.d.ts
2024-01-20 18:30:10 -08:00

218 lines
4.1 KiB
TypeScript

/**
* File-like object that can read slices of a file asynchronously.
*/
export interface AsyncBuffer {
byteLength: number
slice(start: number, end?: number): Promise<ArrayBuffer>
}
/**
* Just like an ArrayBuffer, but an interface
*/
export interface ArrayBufferLike {
byteLength: number
slice(start: number, end?: number): ArrayBuffer
}
/**
* Represents a decoded value, and includes the number of bytes read.
* This is used to read data from the file and advance a virtual file pointer.
*/
export interface Decoded<T> {
value: T
byteLength: number
}
// Parquet file metadata types
export interface FileMetaData {
version: number
schema: SchemaElement[]
num_rows: number
row_groups: RowGroup[]
key_value_metadata?: KeyValue[]
created_by?: string
metadata_length: number
}
export interface SchemaTree {
element: SchemaElement
children: SchemaTree[]
count: number
}
export interface SchemaElement {
type?: ParquetType
type_length?: number
repetition_type?: FieldRepetitionType
name: string
num_children?: number
converted_type?: ConvertedType
scale?: number
precision?: number
field_id?: number
}
export enum ParquetType {
BOOLEAN = 0,
INT32 = 1,
INT64 = 2,
INT96 = 3, // deprecated
FLOAT = 4,
DOUBLE = 5,
BYTE_ARRAY = 6,
FIXED_LEN_BYTE_ARRAY = 7,
}
export enum FieldRepetitionType {
REQUIRED = 0,
OPTIONAL = 1,
REPEATED = 2,
}
export enum ConvertedType {
UTF8 = 0,
MAP = 1,
MAP_KEY_VALUE = 2,
LIST = 3,
ENUM = 4,
DECIMAL = 5,
DATE = 6,
TIME_MILLIS = 7,
TIME_MICROS = 8,
TIMESTAMP_MILLIS = 9,
TIMESTAMP_MICROS = 10,
UINT_8 = 11,
UINT_16 = 12,
UINT_32 = 13,
UINT_64 = 14,
INT_8 = 15,
INT_16 = 16,
INT_32 = 17,
INT_64 = 18,
JSON = 19,
BSON = 20,
INTERVAL = 21,
}
export interface RowGroup {
columns: ColumnChunk[]
total_byte_size: number
num_rows: number
sorting_columns?: SortingColumn[]
}
export interface ColumnChunk {
file_path?: string
file_offset: number
meta_data?: ColumnMetaData
}
export interface ColumnMetaData {
type: ParquetType
encodings: Encoding[]
path_in_schema: string[]
codec: CompressionCodec
num_values: number
total_uncompressed_size: number
total_compressed_size: number
key_value_metadata?: KeyValue[]
data_page_offset: number
index_page_offset?: number
dictionary_page_offset?: number
statistics?: Statistics
encoding_stats?: PageEncodingStats[]
}
export enum Encoding {
PLAIN = 0,
PLAIN_DICTIONARY = 2,
RLE = 3,
BIT_PACKED = 4, // deprecated
DELTA_BINARY_PACKED = 5,
DELTA_LENGTH_BYTE_ARRAY = 6,
DELTA_BYTE_ARRAY = 7,
RLE_DICTIONARY = 8,
BYTE_STREAM_SPLIT = 9,
}
export enum CompressionCodec {
UNCOMPRESSED = 0,
SNAPPY = 1,
GZIP = 2,
LZO = 3,
BROTLI = 4,
LZ4 = 5,
ZSTD = 6,
LZ4_RAW = 7,
}
interface KeyValue {
key: string
value?: string
}
export interface Statistics {
max?: Uint8Array // binary representation
min?: Uint8Array // binary representation
null_count?: number
distinct_count?: number
}
interface PageEncodingStats {
page_type: PageType
encoding: Encoding
count: number
}
export enum PageType {
DATA_PAGE = 0,
INDEX_PAGE = 1,
DICTIONARY_PAGE = 2,
DATA_PAGE_V2 = 3,
}
interface SortingColumn {
column_idx: number
descending: boolean
nulls_first: boolean
}
// Parquet file header types
export interface PageHeader {
type: PageType
uncompressed_page_size: number
compressed_page_size: number
crc?: number
data_page_header?: DataPageHeader
index_page_header?: IndexPageHeader
dictionary_page_header?: DictionaryPageHeader
data_page_header_v2?: DataPageHeaderV2
}
export interface DataPageHeader {
num_values: number
encoding: Encoding
definition_level_encoding: Encoding
repetition_level_encoding: Encoding
statistics?: Statistics
}
interface IndexPageHeader {}
export interface DictionaryPageHeader {
num_values: number
encoding: Encoding
is_sorted?: boolean
}
interface DataPageHeaderV2 {
num_values: number
num_nulls: number
num_rows: number
encoding: Encoding
definition_levels_byte_length: number
repetition_levels_byte_length: number
is_compressed?: boolean
statistics?: Statistics
}