2024-01-14 21:38:05 +00:00
|
|
|
/**
|
|
|
|
|
* Just like an ArrayBuffer, but an interface
|
|
|
|
|
*/
|
|
|
|
|
export interface ArrayBufferLike {
|
|
|
|
|
byteLength: number
|
|
|
|
|
slice(start: number, end?: number): ArrayBuffer
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-03 17:56:17 +00:00
|
|
|
/**
|
|
|
|
|
* Represents a decoded value, and includes the number of bytes read.
|
|
|
|
|
* This is used to read data from the file and advance a virtual file pointer.
|
|
|
|
|
*/
|
|
|
|
|
export interface Decoded<T> {
|
|
|
|
|
value: T
|
|
|
|
|
byteLength: number
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Parquet file metadata types
|
|
|
|
|
export interface FileMetaData {
|
|
|
|
|
version: number
|
|
|
|
|
schema: SchemaElement[]
|
|
|
|
|
num_rows: number
|
|
|
|
|
row_groups: RowGroup[]
|
|
|
|
|
key_value_metadata?: KeyValue[]
|
|
|
|
|
created_by?: string
|
2024-01-12 22:35:20 +00:00
|
|
|
metadata_length: number
|
2024-01-03 17:56:17 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export interface SchemaElement {
|
|
|
|
|
type?: ParquetType
|
|
|
|
|
type_length?: number
|
|
|
|
|
repetition_type?: FieldRepetitionType
|
|
|
|
|
name: string
|
|
|
|
|
num_children?: number
|
|
|
|
|
converted_type?: ConvertedType
|
|
|
|
|
scale?: number
|
|
|
|
|
precision?: number
|
|
|
|
|
field_id?: number
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export enum ParquetType {
|
|
|
|
|
BOOLEAN = 0,
|
|
|
|
|
INT32 = 1,
|
|
|
|
|
INT64 = 2,
|
2024-01-05 09:39:59 +00:00
|
|
|
INT96 = 3, // deprecated
|
2024-01-03 17:56:17 +00:00
|
|
|
FLOAT = 4,
|
|
|
|
|
DOUBLE = 5,
|
|
|
|
|
BYTE_ARRAY = 6,
|
|
|
|
|
FIXED_LEN_BYTE_ARRAY = 7,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export enum FieldRepetitionType {
|
|
|
|
|
REQUIRED = 0,
|
|
|
|
|
OPTIONAL = 1,
|
|
|
|
|
REPEATED = 2,
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-04 17:46:55 +00:00
|
|
|
export enum ConvertedType {
|
2024-01-03 17:56:17 +00:00
|
|
|
UTF8 = 0,
|
|
|
|
|
MAP = 1,
|
|
|
|
|
MAP_KEY_VALUE = 2,
|
|
|
|
|
LIST = 3,
|
|
|
|
|
ENUM = 4,
|
|
|
|
|
DECIMAL = 5,
|
|
|
|
|
DATE = 6,
|
|
|
|
|
TIME_MILLIS = 7,
|
|
|
|
|
TIME_MICROS = 8,
|
|
|
|
|
TIMESTAMP_MILLIS = 9,
|
|
|
|
|
TIMESTAMP_MICROS = 10,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export interface RowGroup {
|
|
|
|
|
columns: ColumnChunk[]
|
|
|
|
|
total_byte_size: number
|
|
|
|
|
num_rows: number
|
|
|
|
|
sorting_columns?: SortingColumn[]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export interface ColumnChunk {
|
|
|
|
|
file_path?: string
|
|
|
|
|
file_offset: number
|
|
|
|
|
meta_data?: ColumnMetaData
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export interface ColumnMetaData {
|
|
|
|
|
type: ParquetType
|
|
|
|
|
encodings: Encoding[]
|
|
|
|
|
path_in_schema: string[]
|
|
|
|
|
codec: CompressionCodec
|
|
|
|
|
num_values: number
|
|
|
|
|
total_uncompressed_size: number
|
|
|
|
|
total_compressed_size: number
|
|
|
|
|
key_value_metadata?: KeyValue[]
|
|
|
|
|
data_page_offset: number
|
|
|
|
|
index_page_offset?: number
|
|
|
|
|
dictionary_page_offset?: number
|
|
|
|
|
statistics?: Statistics
|
|
|
|
|
encoding_stats?: PageEncodingStats[]
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export enum Encoding {
|
|
|
|
|
PLAIN = 0,
|
|
|
|
|
PLAIN_DICTIONARY = 2,
|
|
|
|
|
RLE = 3,
|
2024-01-05 09:39:59 +00:00
|
|
|
BIT_PACKED = 4, // deprecated
|
2024-01-03 17:56:17 +00:00
|
|
|
DELTA_BINARY_PACKED = 5,
|
|
|
|
|
DELTA_LENGTH_BYTE_ARRAY = 6,
|
|
|
|
|
DELTA_BYTE_ARRAY = 7,
|
|
|
|
|
RLE_DICTIONARY = 8,
|
|
|
|
|
BYTE_STREAM_SPLIT = 9,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export enum CompressionCodec {
|
|
|
|
|
UNCOMPRESSED = 0,
|
|
|
|
|
SNAPPY = 1,
|
|
|
|
|
GZIP = 2,
|
|
|
|
|
LZO = 3,
|
|
|
|
|
BROTLI = 4,
|
|
|
|
|
LZ4 = 5,
|
|
|
|
|
ZSTD = 6,
|
|
|
|
|
LZ4_RAW = 7,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
interface KeyValue {
|
|
|
|
|
key: string
|
|
|
|
|
value?: string
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export interface Statistics {
|
|
|
|
|
max?: Uint8Array // binary representation
|
|
|
|
|
min?: Uint8Array // binary representation
|
|
|
|
|
null_count?: number
|
|
|
|
|
distinct_count?: number
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
interface PageEncodingStats {
|
|
|
|
|
page_type: PageType
|
|
|
|
|
encoding: Encoding
|
|
|
|
|
count: number
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export enum PageType {
|
|
|
|
|
DATA_PAGE = 0,
|
|
|
|
|
INDEX_PAGE = 1,
|
|
|
|
|
DICTIONARY_PAGE = 2,
|
|
|
|
|
DATA_PAGE_V2 = 3,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
interface SortingColumn {
|
|
|
|
|
column_idx: number
|
|
|
|
|
descending: boolean
|
|
|
|
|
nulls_first: boolean
|
|
|
|
|
}
|
2024-01-05 11:06:27 +00:00
|
|
|
|
|
|
|
|
// Parquet file header types
|
|
|
|
|
export interface PageHeader {
|
|
|
|
|
type: PageType
|
|
|
|
|
uncompressed_page_size: number
|
|
|
|
|
compressed_page_size: number
|
|
|
|
|
crc?: number
|
|
|
|
|
data_page_header?: DataPageHeader
|
|
|
|
|
index_page_header?: IndexPageHeader
|
|
|
|
|
dictionary_page_header?: DictionaryPageHeader
|
|
|
|
|
data_page_header_v2?: DataPageHeaderV2
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
export interface DataPageHeader {
|
|
|
|
|
num_values: number
|
|
|
|
|
encoding: Encoding
|
|
|
|
|
definition_level_encoding: Encoding
|
|
|
|
|
repetition_level_encoding: Encoding
|
|
|
|
|
statistics?: Statistics
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
interface IndexPageHeader {}
|
|
|
|
|
|
|
|
|
|
export interface DictionaryPageHeader {
|
|
|
|
|
num_values: number
|
|
|
|
|
encoding: Encoding
|
|
|
|
|
is_sorted?: boolean
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
interface DataPageHeaderV2 {
|
|
|
|
|
num_values: number
|
|
|
|
|
num_nulls: number
|
|
|
|
|
num_rows: number
|
|
|
|
|
encoding: Encoding
|
|
|
|
|
definition_levels_byte_length: number
|
|
|
|
|
repetition_levels_byte_length: number
|
|
|
|
|
is_compressed?: boolean
|
|
|
|
|
statistics?: Statistics
|
|
|
|
|
}
|