hyparquet-writer/src/datapage.js

76 lines
2.9 KiB
JavaScript
Raw Normal View History

2025-04-13 20:44:48 +00:00
import { Encoding, PageType } from 'hyparquet/src/constants.js'
import { writeRleBitPackedHybrid } from './encoding.js'
import { serializeTCompactProtocol } from './thrift.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
/**
 * Serialize a parquet page header to the writer with the thrift compact protocol.
 *
 * Enum-valued members (page type and encodings) are given as names and are
 * written as their index in the PageType / Encoding tables.
 *
 * @import {DecodedArray, PageHeader, SchemaElement} from 'hyparquet'
 * @import {Writer} from '../src/types.js'
 * @param {Writer} writer
 * @param {PageHeader} header
 * @throws {Error} if the page type or an encoding is not a known enum name
 */
export function writePageHeader(writer, header) {
  /**
   * Resolve an enum name to its ordinal, rejecting unknown names instead of
   * letting indexOf's -1 be serialized into the file.
   *
   * @param {readonly string[]} names table of enum names, indexed by ordinal
   * @param {string} name enum name to look up
   * @param {string} label what is being looked up, for the error message
   * @returns {number}
   */
  function enumIndex(names, name, label) {
    const index = names.indexOf(name)
    if (index < 0) throw new Error(`parquet unknown ${label}: ${name}`)
    return index
  }

  const {
    data_page_header: dataPage,
    dictionary_page_header: dictionaryPage,
    data_page_header_v2: dataPageV2,
  } = header

  // Sub-headers that are absent stay undefined so their field is not populated.
  /** @type {import('../src/types.js').ThriftObject} */
  const compact = {
    field_1: enumIndex(PageType, header.type, 'page type'),
    field_2: header.uncompressed_page_size,
    field_3: header.compressed_page_size,
    field_4: header.crc,
    field_5: dataPage && {
      field_1: dataPage.num_values,
      field_2: enumIndex(Encoding, dataPage.encoding, 'encoding'),
      field_3: enumIndex(Encoding, dataPage.definition_level_encoding, 'definition level encoding'),
      field_4: enumIndex(Encoding, dataPage.repetition_level_encoding, 'repetition level encoding'),
      // TODO: field_5 (statistics) is not written yet
    },
    field_7: dictionaryPage && {
      field_1: dictionaryPage.num_values,
      field_2: enumIndex(Encoding, dictionaryPage.encoding, 'encoding'),
    },
    field_8: dataPageV2 && {
      field_1: dataPageV2.num_values,
      field_2: dataPageV2.num_nulls,
      field_3: dataPageV2.num_rows,
      field_4: enumIndex(Encoding, dataPageV2.encoding, 'encoding'),
      field_5: dataPageV2.definition_levels_byte_length,
      field_6: dataPageV2.repetition_levels_byte_length,
      // is_compressed defaults to true, so the field is only written when it
      // is explicitly false; an omitted value keeps the default.
      field_7: dataPageV2.is_compressed === false ? false : undefined,
    },
  }
  serializeTCompactProtocol(writer, compact)
}
/**
 * Write the repetition and definition level sections for a page of values.
 *
 * Repetition levels are not implemented: when the schema path has a non-zero
 * max repetition level, an empty level list is written. For definition
 * levels, a null or undefined value gets `maxDefinitionLevel - 1` and is
 * counted as a null; every other value gets `maxDefinitionLevel`. Nothing is
 * written for a section whose max level is zero.
 *
 * @param {Writer} writer
 * @param {SchemaElement[]} schemaPath
 * @param {DecodedArray} values
 * @returns {{ definition_levels_byte_length: number, repetition_levels_byte_length: number, num_nulls: number}}
 *   the lengths are the values returned by writeRleBitPackedHybrid (0 when a section is skipped)
 */
export function writeLevels(writer, schemaPath, values) {
  // TODO: repetition levels
  const repetition_levels_byte_length = getMaxRepetitionLevel(schemaPath)
    ? writeRleBitPackedHybrid(writer, [])
    : 0

  const maxLevel = getMaxDefinitionLevel(schemaPath)
  if (!maxLevel) {
    // no definition levels for this column: values are not inspected
    return { definition_levels_byte_length: 0, repetition_levels_byte_length, num_nulls: 0 }
  }

  let num_nulls = 0
  const levels = []
  for (const item of values) {
    const missing = item == null // matches both null and undefined
    if (missing) num_nulls += 1
    levels.push(missing ? maxLevel - 1 : maxLevel)
  }
  const definition_levels_byte_length = writeRleBitPackedHybrid(writer, levels)
  return { definition_levels_byte_length, repetition_levels_byte_length, num_nulls }
}