2025-03-26 04:06:43 +00:00
|
|
|
import { ConvertedType, Encoding, FieldRepetitionType, ParquetType } from 'hyparquet/src/constants.js'
|
2025-03-26 00:49:59 +00:00
|
|
|
import { serializeTCompactProtocol } from './thrift.js'
|
|
|
|
|
|
|
|
|
|
/**
 * Compression codec names.
 *
 * Order is significant: writeMetadata serializes a column's codec as the
 * position of its name in this array, so entries must never be reordered.
 * The positions mirror the CompressionCodec enum of the parquet format
 * (UNCOMPRESSED = 0 ... LZ4_RAW = 7).
 *
 * @type {string[]}
 */
const CompressionCodec = [
  'UNCOMPRESSED',
  'SNAPPY',
  'GZIP',
  'LZO',
  'BROTLI',
  'LZ4',
  'ZSTD',
  'LZ4_RAW',
]
|
|
|
|
|
|
|
|
|
|
/**
 * Write the file metadata footer to the writer.
 *
 * The metadata is first reshaped into an object whose keys are `field_N`
 * (presumably the thrift field ids expected by serializeTCompactProtocol),
 * with enum names replaced by their numeric index. That object is serialized
 * to the writer, followed by a uint32 holding how far the writer offset
 * advanced during serialization (i.e. the serialized metadata length).
 *
 * @import {FileMetaData} from 'hyparquet'
 * @import {Writer} from './writer.js'
 * @param {Writer} writer
 * @param {FileMetaData} metadata
 * @throws {Error} if a type, repetition type, converted type, encoding or
 *   codec name is not a known enum value
 */
export function writeMetadata(writer, metadata) {
  const compact = {
    field_1: metadata.version,
    field_2: metadata.schema && metadata.schema.map(element => ({
      // optional enums: a missing name is passed through untouched
      field_1: element.type && enumIndex(ParquetType, element.type, 'type'),
      field_2: element.type_length,
      field_3: element.repetition_type &&
        enumIndex(FieldRepetitionType, element.repetition_type, 'repetition type'),
      field_4: element.name,
      field_5: element.num_children,
      field_6: element.converted_type &&
        enumIndex(ConvertedType, element.converted_type, 'converted type'),
      field_7: element.scale,
      field_8: element.precision,
      field_9: element.field_id,
      field_10: element.logical_type,
    })),
    field_3: metadata.num_rows,
    field_4: metadata.row_groups.map(rg => ({
      field_1: rg.columns.map(c => ({
        field_1: c.file_path,
        field_2: c.file_offset,
        field_3: c.meta_data && {
          field_1: enumIndex(ParquetType, c.meta_data.type, 'type'),
          field_2: c.meta_data.encodings.map(e => enumIndex(Encoding, e, 'encoding')),
          field_3: c.meta_data.path_in_schema,
          field_4: enumIndex(CompressionCodec, c.meta_data.codec, 'compression codec'),
          field_5: c.meta_data.num_values,
          field_6: c.meta_data.total_uncompressed_size,
          field_7: c.meta_data.total_compressed_size,
          field_8: c.meta_data.key_value_metadata,
          field_9: c.meta_data.data_page_offset,
          field_10: c.meta_data.index_page_offset,
          field_11: c.meta_data.dictionary_page_offset,
          field_12: c.meta_data.statistics,
          field_13: c.meta_data.encoding_stats,
          field_14: c.meta_data.bloom_filter_offset,
          field_15: c.meta_data.bloom_filter_length,
          field_16: c.meta_data.size_statistics,
        },
        field_4: c.offset_index_offset,
        field_5: c.offset_index_length,
        field_6: c.column_index_offset,
        field_7: c.column_index_length,
        field_8: c.crypto_metadata,
        field_9: c.encrypted_column_metadata,
      })),
      field_2: rg.total_byte_size,
      field_3: rg.num_rows,
      field_4: rg.sorting_columns,
      field_5: rg.file_offset,
      field_6: rg.total_compressed_size,
      field_7: rg.ordinal,
    })),
    field_5: metadata.key_value_metadata,
    field_6: metadata.created_by,
  }

  // serialize the metadata, remembering where it started
  const metadataStart = writer.offset
  serializeTCompactProtocol(writer, compact)

  // trailing uint32: how much the writer advanced while serializing
  const metadataLength = writer.offset - metadataStart
  writer.appendUint32(metadataLength)
}

/**
 * Find the numeric index of an enum name, rejecting unknown names.
 *
 * A bare indexOf would yield -1 for an unknown name, which would then be
 * serialized as if it were a valid enum value.
 *
 * @param {readonly (string | undefined)[]} values enum names, ordered by numeric value
 * @param {string} name enum name to look up
 * @param {string} label human-readable enum kind, used in the error message
 * @returns {number} index of name within values
 */
function enumIndex(values, name, label) {
  const index = values.indexOf(name)
  if (index === -1) {
    throw new Error(`parquet metadata unknown ${label}: ${String(name)}`)
  }
  return index
}
|