hyparquet-writer/src/column.js

127 lines
3.9 KiB
JavaScript
Raw Normal View History

2025-03-26 04:06:43 +00:00
import { Encoding, PageType } from 'hyparquet/src/constants.js'
import { writeRleBitPackedHybrid } from './encoding.js'
import { writePlain } from './plain.js'
import { serializeTCompactProtocol } from './thrift.js'
import { Writer } from './writer.js'
/**
* @import {ColumnMetaData, DecodedArray, PageHeader, ParquetType} from 'hyparquet/src/types.js'
* @param {Writer} writer
* @param {string} columnName
* @param {DecodedArray} values
* @param {ParquetType} type
* @returns {ColumnMetaData}
*/
export function writeColumn(writer, columnName, values, type) {
// Get data stats
const num_nulls = values.filter(v => v === null).length
const offsetStart = writer.offset
// Write page to temp buffer
const page = new Writer()
/** @type {import('hyparquet/src/types.js').Encoding} */
const encoding = 'PLAIN'
// TODO: repetition levels
const maxRepetitionLevel = 0
let repetition_levels_byte_length = 0
if (maxRepetitionLevel) {
repetition_levels_byte_length = writeRleBitPackedHybrid(page, [])
}
// TODO: definition levels
const maxDefinitionLevel = 0
let definition_levels_byte_length = 0
if (maxDefinitionLevel) {
definition_levels_byte_length = writeRleBitPackedHybrid(page, [])
}
// write page data (TODO: compressed)
const { uncompressed_page_size, compressed_page_size } = writePageData(page, values, type)
// write page header
/** @type {PageHeader} */
const header = {
type: 'DATA_PAGE_V2',
uncompressed_page_size,
compressed_page_size,
data_page_header_v2: {
num_values: values.length,
num_nulls,
num_rows: values.length,
encoding,
definition_levels_byte_length,
repetition_levels_byte_length,
is_compressed: false,
},
}
writePageHeader(writer, header)
// write page data
writer.appendBuffer(page.getBuffer())
return {
type,
encodings: ['PLAIN'],
path_in_schema: [columnName],
codec: 'UNCOMPRESSED',
num_values: BigInt(values.length),
total_compressed_size: BigInt(writer.offset - offsetStart),
total_uncompressed_size: BigInt(writer.offset - offsetStart),
data_page_offset: BigInt(offsetStart),
}
}
/**
* Deduce a ParquetType from the JS value
*
* @param {DecodedArray} values
* @returns {ParquetType}
*/
export function getParquetTypeForValues(values) {
if (values.every(v => typeof v === 'boolean')) return 'BOOLEAN'
if (values.every(v => typeof v === 'bigint')) return 'INT64'
if (values.every(v => Number.isInteger(v))) return 'INT32'
if (values.every(v => typeof v === 'number')) return 'DOUBLE'
if (values.every(v => typeof v === 'string')) return 'BYTE_ARRAY'
throw new Error(`Cannot determine parquet type for: ${values}`)
}
/**
* @param {Writer} writer
* @param {PageHeader} header
*/
function writePageHeader(writer, header) {
const compact = {
field_1: PageType.indexOf(header.type),
field_2: header.uncompressed_page_size,
field_3: header.compressed_page_size,
field_8: header.data_page_header_v2 && {
field_1: header.data_page_header_v2.num_values,
field_2: header.data_page_header_v2.num_nulls,
field_3: header.data_page_header_v2.num_rows,
field_4: Encoding.indexOf(header.data_page_header_v2.encoding),
field_5: header.data_page_header_v2.definition_levels_byte_length,
field_6: header.data_page_header_v2.repetition_levels_byte_length,
field_7: header.data_page_header_v2.is_compressed ? undefined : false, // default true
},
}
serializeTCompactProtocol(writer, compact)
}
/**
* @param {Writer} writer
* @param {DecodedArray} values
* @param {ParquetType} type
* @returns {{ uncompressed_page_size: number, compressed_page_size: number }}
*/
function writePageData(writer, values, type) {
// write plain data
const startOffset = writer.offset
writePlain(writer, values, type)
const size = writer.offset - startOffset
return { uncompressed_page_size: size, compressed_page_size: size }
}