// hyparquet-writer/src/metadata.js

import { ConvertedType, Encoding, FieldRepetitionType, ParquetType } from 'hyparquet/src/constants.js'
2025-03-26 00:49:59 +00:00
import { serializeTCompactProtocol } from './thrift.js'
/**
 * Compression codec names. writeMetadata writes a column's codec as the
 * index of its name in this array, so the order must not change.
 */
const CompressionCodec = [
  'UNCOMPRESSED', // 0
  'SNAPPY', // 1
  'GZIP', // 2
  'LZO', // 3
  'BROTLI', // 4
  'LZ4', // 5
  'ZSTD', // 6
  'LZ4_RAW', // 7
]
/**
 * Serialize Parquet file metadata and append it to the writer.
 *
 * Builds a plain object keyed by `field_N` from `metadata`, hands it to
 * serializeTCompactProtocol, then appends the number of bytes that call
 * advanced `writer.offset` by, using `writer.appendUint32`.
 *
 * Enum names (parquet type, repetition type, converted type, encoding and
 * compression codec) are written as their index in the matching lookup array.
 *
 * @import {FileMetaData} from 'hyparquet'
 * @import {Writer} from './writer.js'
 * @param {Writer} writer
 * @param {FileMetaData} metadata
 * @throws {Error} if an enum name is not present in its lookup array
 */
export function writeMetadata(writer, metadata) {
  const compact = {
    field_1: metadata.version,
    // One entry per schema element. Optional enum fields keep their falsy
    // value (e.g. undefined) instead of being looked up.
    field_2: metadata.schema && metadata.schema.map(element => ({
      field_1: element.type && enumIndex(ParquetType, element.type, 'type'),
      field_2: element.type_length,
      field_3: element.repetition_type &&
        enumIndex(FieldRepetitionType, element.repetition_type, 'repetition_type'),
      field_4: element.name,
      field_5: element.num_children,
      field_6: element.converted_type &&
        enumIndex(ConvertedType, element.converted_type, 'converted_type'),
      field_7: element.scale,
      field_8: element.precision,
      field_9: element.field_id,
      field_10: element.logical_type,
    })),
    field_3: metadata.num_rows,
    // One entry per row group, each holding its column chunks.
    field_4: metadata.row_groups.map(rg => ({
      field_1: rg.columns.map(c => ({
        field_1: c.file_path,
        field_2: c.file_offset,
        field_3: c.meta_data && {
          field_1: enumIndex(ParquetType, c.meta_data.type, 'column type'),
          field_2: c.meta_data.encodings.map(e => enumIndex(Encoding, e, 'encoding')),
          field_3: c.meta_data.path_in_schema,
          field_4: enumIndex(CompressionCodec, c.meta_data.codec, 'codec'),
          field_5: c.meta_data.num_values,
          field_6: c.meta_data.total_uncompressed_size,
          field_7: c.meta_data.total_compressed_size,
          // NOTE(review): nested values such as key_value_metadata, statistics,
          // encoding_stats, size_statistics, sorting_columns and logical_type
          // are passed through unchanged. Presumably the serializer needs them
          // in `field_N` form too - confirm against serializeTCompactProtocol.
          field_8: c.meta_data.key_value_metadata,
          field_9: c.meta_data.data_page_offset,
          field_10: c.meta_data.index_page_offset,
          field_11: c.meta_data.dictionary_page_offset,
          field_12: c.meta_data.statistics,
          field_13: c.meta_data.encoding_stats,
          field_14: c.meta_data.bloom_filter_offset,
          field_15: c.meta_data.bloom_filter_length,
          field_16: c.meta_data.size_statistics,
        },
        field_4: c.offset_index_offset,
        field_5: c.offset_index_length,
        field_6: c.column_index_offset,
        field_7: c.column_index_length,
        field_8: c.crypto_metadata,
        field_9: c.encrypted_column_metadata,
      })),
      field_2: rg.total_byte_size,
      field_3: rg.num_rows,
      field_4: rg.sorting_columns,
      field_5: rg.file_offset,
      field_6: rg.total_compressed_size,
      field_7: rg.ordinal,
    })),
    field_5: metadata.key_value_metadata,
    field_6: metadata.created_by,
  }

  // Measure how far the serializer advances the writer so the byte length
  // of the metadata can be appended right after it.
  const metadataStart = writer.offset
  serializeTCompactProtocol(writer, compact)
  const metadataLength = writer.offset - metadataStart
  writer.appendUint32(metadataLength)
}

/**
 * Look up the integer index of an enum name.
 *
 * Throws instead of returning -1, so an unrecognized name is never written
 * into the metadata as an invalid enum value.
 *
 * @param {readonly string[]} names lookup array to search
 * @param {string} value enum name to look up
 * @param {string} label field description used in the error message
 * @returns {number} index of value in names
 * @throws {Error} if value is not in names
 */
function enumIndex(names, value, label) {
  const index = names.indexOf(value)
  if (index < 0) {
    throw new Error(`parquet metadata unknown ${label}: ${value}`)
  }
  return index
}