import { Encoding, PageType } from 'hyparquet/src/constants.js'
import { unconvert } from './convert.js'
import { writeRleBitPackedHybrid } from './encoding.js'
import { writePlain } from './plain.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
import { serializeTCompactProtocol } from './thrift.js'
import { Writer } from './writer.js'

/**
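 * Write one column chunk as a single DATA_PAGE_V2 page and return its column metadata.
 *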
 * @import {ColumnMetaData, DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
 * @param {Writer} writer
 * @param {SchemaElement[]} schemaPath
 * @param {DecodedArray} values
 * @returns {ColumnMetaData}
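 *
 * @example
 * // Minimal usage sketch with a hypothetical schema path (root group plus an optional INT32 leaf):
 * const writer = new Writer()
 * const meta = writeColumn(writer, [
 *   { name: 'root', num_children: 1 },
 *   { name: 'id', type: 'INT32', repetition_type: 'OPTIONAL' },
 * ], [1, 2, null, 4])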
 */
export function writeColumn(writer, schemaPath, values) {
  const schemaElement = schemaPath[schemaPath.length - 1]
  const { type } = schemaElement
  if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`)
  const offsetStart = writer.offset
  const num_values = values.length
  let num_nulls = 0

  // Write levels to temp buffer
  const levels = new Writer()

  // TODO: repetition levels
  const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
  let repetition_levels_byte_length = 0
  if (maxRepetitionLevel) {
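    // nested columns are not supported yet (see TODO above), so write an empty levels array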
    repetition_levels_byte_length = writeRleBitPackedHybrid(levels, [])
  }

  // definition levels
  const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
  let definition_levels_byte_length = 0
  if (maxDefinitionLevel) {
    const definitionLevels = []
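    // for flat columns: maxDefinitionLevel = value present, maxDefinitionLevel - 1 = null at the leaf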
    for (const value of values) {
      if (value === null || value === undefined) {
        definitionLevels.push(maxDefinitionLevel - 1)
        num_nulls++
      } else {
        definitionLevels.push(maxDefinitionLevel)
      }
    }
    definition_levels_byte_length = writeRleBitPackedHybrid(levels, definitionLevels)
  }

  // Unconvert type and filter out nulls
  values = unconvert(schemaElement, values)
    .filter(v => v !== null && v !== undefined)

  // write page data to temp buffer
  const page = new Writer()
  writePageData(page, values, type)

  // TODO: compress page data

  // write page header
  /** @type {PageHeader} */
  const header = {
    type: 'DATA_PAGE_V2',
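    // v2 page sizes include the levels, which are always stored uncompressed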
    uncompressed_page_size: levels.offset + page.offset,
    compressed_page_size: levels.offset + page.offset,
    data_page_header_v2: {
      num_values,
      num_nulls,
      num_rows: num_values,
      encoding: 'PLAIN',
      definition_levels_byte_length,
      repetition_levels_byte_length,
      is_compressed: false,
    },
  }
  writePageHeader(writer, header)

  // write levels
  writer.appendBuffer(levels.getBuffer())

  // write page data
  writer.appendBuffer(page.getBuffer())

  return {
    type,
    encodings: ['PLAIN'],
    path_in_schema: schemaPath.slice(1).map(s => s.name),
    codec: 'UNCOMPRESSED',
    num_values: BigInt(num_values),
    total_compressed_size: BigInt(writer.offset - offsetStart),
    total_uncompressed_size: BigInt(writer.offset - offsetStart),
    data_page_offset: BigInt(offsetStart),
  }
}

/**
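 * Serialize a page header to the writer using the thrift compact protocol.
 *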
 * @param {Writer} writer
 * @param {PageHeader} header
 */
function writePageHeader(writer, header) {
  const compact = {
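    // field ids correspond to the PageHeader struct in the parquet-format thrift definition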
    field_1: PageType.indexOf(header.type),
    field_2: header.uncompressed_page_size,
    field_3: header.compressed_page_size,
    field_8: header.data_page_header_v2 && {
      field_1: header.data_page_header_v2.num_values,
      field_2: header.data_page_header_v2.num_nulls,
      field_3: header.data_page_header_v2.num_rows,
      field_4: Encoding.indexOf(header.data_page_header_v2.encoding),
      field_5: header.data_page_header_v2.definition_levels_byte_length,
      field_6: header.data_page_header_v2.repetition_levels_byte_length,
      field_7: header.data_page_header_v2.is_compressed ? undefined : false, // default true
    },
  }
  serializeTCompactProtocol(writer, compact)
}

/**
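 * Write values to the page as PLAIN-encoded data (the only encoding supported here).
 *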
 * @param {Writer} writer
 * @param {DecodedArray} values
 * @param {ParquetType} type
 */
function writePageData(writer, values, type) {
  // write plain data
  writePlain(writer, values, type)
}