hyparquet-writer/src/metadata.js
2025-04-13 17:28:49 -07:00

138 lines
5.3 KiB
JavaScript

import { CompressionCodec, ConvertedType, Encoding, FieldRepetitionType, PageType, ParquetType } from 'hyparquet/src/constants.js'
import { serializeTCompactProtocol } from './thrift.js'
import { unconvertStatistics } from './unconvert.js'
/**
 * Write parquet file metadata to the writer.
 *
 * The metadata is translated into a ThriftObject keyed by thrift field number
 * (`field_N`), serialized with serializeTCompactProtocol, and followed by the
 * length of the serialized metadata (the change in writer.offset), appended
 * with writer.appendUint32.
 *
 * Enum-like strings (types, encodings, codecs, page types, ...) are written
 * as their index in the matching hyparquet constant array.
 *
 * @import {FileMetaData, LogicalType, TimeUnit} from 'hyparquet'
 * @import {ThriftObject, Writer} from '../src/types.js'
 * @param {Writer} writer
 * @param {FileMetaData} metadata
 */
export function writeMetadata(writer, metadata) {
  // The schema is a depth-first flattened tree whose first element is the root.
  // Column chunks in a row group line up with the schema's LEAF elements, in
  // order, so statistics must be encoded against the matching leaf. Indexing
  // with schema[columnIndex + 1] is only correct for flat schemas; with nested
  // groups it would pick a group node or the wrong leaf.
  const leafElements = (metadata.schema || [])
    .slice(1)
    .filter(element => !element.num_children)

  /** @type {ThriftObject} */
  const compact = {
    field_1: metadata.version,
    field_2: metadata.schema && metadata.schema.map(element => ({
      field_1: element.type && ParquetType.indexOf(element.type),
      field_2: element.type_length,
      field_3: element.repetition_type && FieldRepetitionType.indexOf(element.repetition_type),
      field_4: element.name,
      field_5: element.num_children,
      field_6: element.converted_type && ConvertedType.indexOf(element.converted_type),
      field_7: element.scale,
      field_8: element.precision,
      field_9: element.field_id,
      field_10: logicalType(element.logical_type),
    })),
    field_3: metadata.num_rows,
    field_4: metadata.row_groups.map(rg => ({
      field_1: rg.columns.map((c, columnIndex) => ({
        field_1: c.file_path,
        field_2: c.file_offset,
        field_3: c.meta_data && {
          field_1: ParquetType.indexOf(c.meta_data.type),
          field_2: c.meta_data.encodings.map(e => Encoding.indexOf(e)),
          field_3: c.meta_data.path_in_schema,
          field_4: CompressionCodec.indexOf(c.meta_data.codec),
          field_5: c.meta_data.num_values,
          field_6: c.meta_data.total_uncompressed_size,
          field_7: c.meta_data.total_compressed_size,
          field_8: c.meta_data.key_value_metadata && c.meta_data.key_value_metadata.map(kv => ({
            field_1: kv.key,
            field_2: kv.value,
          })),
          field_9: c.meta_data.data_page_offset,
          field_10: c.meta_data.index_page_offset,
          field_11: c.meta_data.dictionary_page_offset,
          field_12: c.meta_data.statistics && unconvertStatistics(
            c.meta_data.statistics,
            // Prefer the leaf element for this column; keep the positional
            // lookup as a fallback when the schema lists fewer leaves than
            // the row group has columns.
            leafElements[columnIndex] || metadata.schema[columnIndex + 1]
          ),
          field_13: c.meta_data.encoding_stats && c.meta_data.encoding_stats.map(es => ({
            field_1: PageType.indexOf(es.page_type),
            field_2: Encoding.indexOf(es.encoding),
            field_3: es.count,
          })),
          field_14: c.meta_data.bloom_filter_offset,
          field_15: c.meta_data.bloom_filter_length,
          field_16: c.meta_data.size_statistics && {
            field_1: c.meta_data.size_statistics.unencoded_byte_array_data_bytes,
            field_2: c.meta_data.size_statistics.repetition_level_histogram,
            field_3: c.meta_data.size_statistics.definition_level_histogram,
          },
        },
        field_4: c.offset_index_offset,
        field_5: c.offset_index_length,
        field_6: c.column_index_offset,
        field_7: c.column_index_length,
        // field_8 (c.crypto_metadata) is intentionally not written
        field_9: c.encrypted_column_metadata,
      })),
      field_2: rg.total_byte_size,
      field_3: rg.num_rows,
      field_4: rg.sorting_columns && rg.sorting_columns.map(sc => ({
        field_1: sc.column_idx,
        field_2: sc.descending,
        field_3: sc.nulls_first,
      })),
      field_5: rg.file_offset,
      field_6: rg.total_compressed_size,
      // field_7 (rg.ordinal) is not written: it should be an int16
    })),
    field_5: metadata.key_value_metadata && metadata.key_value_metadata.map(kv => ({
      field_1: kv.key,
      field_2: kv.value,
    })),
    field_6: metadata.created_by,
  }

  // Measure how far the writer advanced so the length can be appended after
  // the serialized metadata.
  const metadataStart = writer.offset
  serializeTCompactProtocol(writer, compact)
  const metadataLength = writer.offset - metadataStart
  writer.appendUint32(metadataLength)
}
/**
 * Convert a logical type into its thrift union representation: an object with
 * a single `field_N` key, where N selects the logical type and the value holds
 * that type's parameters (an empty object for parameterless types).
 *
 * Returns undefined when no logical type is given or its `type` is not one of
 * the handled names.
 *
 * @param {LogicalType | undefined} type
 * @returns {ThriftObject | undefined}
 */
export function logicalType(type) {
  if (!type) return undefined

  switch (type.type) {
    case 'STRING':
      return { field_1: {} }
    case 'MAP':
      return { field_2: {} }
    case 'LIST':
      return { field_3: {} }
    case 'ENUM':
      return { field_4: {} }
    case 'DECIMAL':
      return {
        field_5: {
          field_1: type.scale,
          field_2: type.precision,
        },
      }
    case 'DATE':
      return { field_6: {} }
    case 'TIME':
      return {
        field_7: {
          field_1: type.isAdjustedToUTC,
          field_2: timeUnit(type.unit),
        },
      }
    case 'TIMESTAMP':
      return {
        field_8: {
          field_1: type.isAdjustedToUTC,
          field_2: timeUnit(type.unit),
        },
      }
    // note: no logical type is mapped to field_9
    case 'INTEGER':
      return {
        field_10: {
          field_1: type.bitWidth,
          field_2: type.isSigned,
        },
      }
    case 'NULL':
      return { field_11: {} }
    case 'JSON':
      return { field_12: {} }
    case 'BSON':
      return { field_13: {} }
    case 'UUID':
      return { field_14: {} }
    case 'FLOAT16':
      return { field_15: {} }
    case 'VARIANT':
      return { field_16: {} }
    case 'GEOMETRY':
      return { field_17: {} }
    case 'GEOGRAPHY':
      return { field_18: {} }
    default:
      return undefined
  }
}
/**
 * Encode a time unit as a thrift union object with a single empty-struct
 * field: 'MICROS' maps to field_2, 'NANOS' to field_3, and every other value
 * to field_1.
 *
 * @param {TimeUnit} unit
 * @returns {ThriftObject}
 */
function timeUnit(unit) {
  switch (unit) {
    case 'MICROS':
      return { field_2: {} }
    case 'NANOS':
      return { field_3: {} }
    default:
      // anything that is not MICROS or NANOS is encoded as field_1
      return { field_1: {} }
  }
}