2025-03-26 04:06:43 +00:00
|
|
|
import { Encoding, PageType } from 'hyparquet/src/constants.js'
|
|
|
|
|
import { writeRleBitPackedHybrid } from './encoding.js'
|
|
|
|
|
import { writePlain } from './plain.js'
|
|
|
|
|
import { serializeTCompactProtocol } from './thrift.js'
|
|
|
|
|
import { Writer } from './writer.js'
|
|
|
|
|
|
|
|
|
|
/**
|
2025-03-26 05:36:06 +00:00
|
|
|
* @import {ColumnMetaData, DecodedArray, FieldRepetitionType, PageHeader, ParquetType, SchemaElement} from 'hyparquet/src/types.js'
|
2025-03-26 04:06:43 +00:00
|
|
|
* @param {Writer} writer
|
2025-03-26 05:36:06 +00:00
|
|
|
* @param {SchemaElement[]} schemaPath schema path for the column
|
2025-03-26 04:06:43 +00:00
|
|
|
* @param {DecodedArray} values
|
|
|
|
|
* @param {ParquetType} type
|
|
|
|
|
* @returns {ColumnMetaData}
|
|
|
|
|
*/
|
2025-03-26 05:36:06 +00:00
|
|
|
export function writeColumn(writer, schemaPath, values, type) {
|
2025-03-26 04:06:43 +00:00
|
|
|
const offsetStart = writer.offset
|
2025-03-26 05:36:06 +00:00
|
|
|
let num_nulls = 0
|
2025-03-26 04:06:43 +00:00
|
|
|
|
|
|
|
|
// Write page to temp buffer
|
|
|
|
|
const page = new Writer()
|
|
|
|
|
|
|
|
|
|
/** @type {import('hyparquet/src/types.js').Encoding} */
|
|
|
|
|
const encoding = 'PLAIN'
|
|
|
|
|
|
|
|
|
|
// TODO: repetition levels
|
2025-03-26 05:36:06 +00:00
|
|
|
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
|
2025-03-26 04:06:43 +00:00
|
|
|
let repetition_levels_byte_length = 0
|
|
|
|
|
if (maxRepetitionLevel) {
|
|
|
|
|
repetition_levels_byte_length = writeRleBitPackedHybrid(page, [])
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TODO: definition levels
|
2025-03-26 05:36:06 +00:00
|
|
|
const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath)
|
2025-03-26 04:06:43 +00:00
|
|
|
let definition_levels_byte_length = 0
|
|
|
|
|
if (maxDefinitionLevel) {
|
2025-03-26 05:36:06 +00:00
|
|
|
const definitionLevels = []
|
|
|
|
|
for (const value of values) {
|
|
|
|
|
if (value === null || value === undefined) {
|
|
|
|
|
definitionLevels.push(maxDefinitionLevel - 1)
|
|
|
|
|
num_nulls++
|
|
|
|
|
} else {
|
|
|
|
|
definitionLevels.push(maxDefinitionLevel)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
definition_levels_byte_length = writeRleBitPackedHybrid(page, definitionLevels)
|
2025-03-26 04:06:43 +00:00
|
|
|
}
|
|
|
|
|
|
2025-03-26 05:36:06 +00:00
|
|
|
// write page data
|
|
|
|
|
writePageData(page, values, type)
|
|
|
|
|
|
|
|
|
|
// TODO: compress page data
|
2025-03-26 04:06:43 +00:00
|
|
|
|
|
|
|
|
// write page header
|
|
|
|
|
/** @type {PageHeader} */
|
|
|
|
|
const header = {
|
|
|
|
|
type: 'DATA_PAGE_V2',
|
2025-03-26 05:36:06 +00:00
|
|
|
uncompressed_page_size: page.offset,
|
|
|
|
|
compressed_page_size: page.offset,
|
2025-03-26 04:06:43 +00:00
|
|
|
data_page_header_v2: {
|
|
|
|
|
num_values: values.length,
|
|
|
|
|
num_nulls,
|
|
|
|
|
num_rows: values.length,
|
|
|
|
|
encoding,
|
|
|
|
|
definition_levels_byte_length,
|
|
|
|
|
repetition_levels_byte_length,
|
|
|
|
|
is_compressed: false,
|
|
|
|
|
},
|
|
|
|
|
}
|
|
|
|
|
writePageHeader(writer, header)
|
|
|
|
|
|
|
|
|
|
// write page data
|
|
|
|
|
writer.appendBuffer(page.getBuffer())
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
type,
|
|
|
|
|
encodings: ['PLAIN'],
|
2025-03-26 05:36:06 +00:00
|
|
|
path_in_schema: schemaPath.slice(1).map(s => s.name),
|
2025-03-26 04:06:43 +00:00
|
|
|
codec: 'UNCOMPRESSED',
|
|
|
|
|
num_values: BigInt(values.length),
|
|
|
|
|
total_compressed_size: BigInt(writer.offset - offsetStart),
|
|
|
|
|
total_uncompressed_size: BigInt(writer.offset - offsetStart),
|
|
|
|
|
data_page_offset: BigInt(offsetStart),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
2025-03-26 05:36:06 +00:00
|
|
|
* Deduce a ParquetType from JS values
|
2025-03-26 04:06:43 +00:00
|
|
|
*
|
|
|
|
|
* @param {DecodedArray} values
|
2025-03-26 05:36:06 +00:00
|
|
|
* @returns {{ type: ParquetType, repetition_type: 'REQUIRED' | 'OPTIONAL' }}
|
2025-03-26 04:06:43 +00:00
|
|
|
*/
|
|
|
|
|
export function getParquetTypeForValues(values) {
|
2025-03-26 05:36:06 +00:00
|
|
|
if (values instanceof Int32Array) return { type: 'INT32', repetition_type: 'REQUIRED' }
|
|
|
|
|
if (values instanceof BigInt64Array) return { type: 'INT64', repetition_type: 'REQUIRED' }
|
|
|
|
|
if (values instanceof Float32Array) return { type: 'FLOAT', repetition_type: 'REQUIRED' }
|
|
|
|
|
if (values instanceof Float64Array) return { type: 'DOUBLE', repetition_type: 'REQUIRED' }
|
|
|
|
|
/** @type {ParquetType | undefined} */
|
|
|
|
|
let type = undefined
|
|
|
|
|
/** @type {FieldRepetitionType} */
|
|
|
|
|
let repetition_type = 'REQUIRED'
|
|
|
|
|
for (const value of values) {
|
|
|
|
|
const valueType = getParquetTypeForValue(value)
|
|
|
|
|
if (!valueType) {
|
|
|
|
|
repetition_type = 'OPTIONAL'
|
|
|
|
|
} else if (type === undefined) {
|
|
|
|
|
type = valueType
|
|
|
|
|
} else if (type === 'INT32' && valueType === 'DOUBLE') {
|
|
|
|
|
type = 'DOUBLE'
|
|
|
|
|
} else if (type === 'DOUBLE' && valueType === 'INT32') {
|
|
|
|
|
// keep
|
|
|
|
|
} else if (type !== valueType) {
|
|
|
|
|
throw new Error(`parquet cannot write mixed types: ${type} and ${valueType}`)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (!type) throw new Error('parquetWrite: empty column cannot determine type')
|
|
|
|
|
return { type, repetition_type }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param {any} value
|
|
|
|
|
* @returns {ParquetType | undefined}
|
|
|
|
|
*/
|
|
|
|
|
function getParquetTypeForValue(value) {
|
|
|
|
|
if (value === null || value === undefined) return undefined
|
|
|
|
|
if (value === true || value === false) return 'BOOLEAN'
|
|
|
|
|
if (typeof value === 'bigint') return 'INT64'
|
|
|
|
|
if (Number.isInteger(value)) return 'INT32'
|
|
|
|
|
if (typeof value === 'number') return 'DOUBLE'
|
|
|
|
|
if (typeof value === 'string') return 'BYTE_ARRAY'
|
|
|
|
|
throw new Error(`Cannot determine parquet type for: ${value}`)
|
2025-03-26 04:06:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param {Writer} writer
|
|
|
|
|
* @param {PageHeader} header
|
|
|
|
|
*/
|
|
|
|
|
function writePageHeader(writer, header) {
|
|
|
|
|
const compact = {
|
|
|
|
|
field_1: PageType.indexOf(header.type),
|
|
|
|
|
field_2: header.uncompressed_page_size,
|
|
|
|
|
field_3: header.compressed_page_size,
|
|
|
|
|
field_8: header.data_page_header_v2 && {
|
|
|
|
|
field_1: header.data_page_header_v2.num_values,
|
|
|
|
|
field_2: header.data_page_header_v2.num_nulls,
|
|
|
|
|
field_3: header.data_page_header_v2.num_rows,
|
|
|
|
|
field_4: Encoding.indexOf(header.data_page_header_v2.encoding),
|
|
|
|
|
field_5: header.data_page_header_v2.definition_levels_byte_length,
|
|
|
|
|
field_6: header.data_page_header_v2.repetition_levels_byte_length,
|
|
|
|
|
field_7: header.data_page_header_v2.is_compressed ? undefined : false, // default true
|
|
|
|
|
},
|
|
|
|
|
}
|
|
|
|
|
serializeTCompactProtocol(writer, compact)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param {Writer} writer
|
|
|
|
|
* @param {DecodedArray} values
|
|
|
|
|
* @param {ParquetType} type
|
|
|
|
|
*/
|
|
|
|
|
function writePageData(writer, values, type) {
|
|
|
|
|
// write plain data
|
|
|
|
|
writePlain(writer, values, type)
|
2025-03-26 05:36:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Get the max repetition level for a given schema path.
|
|
|
|
|
*
|
|
|
|
|
* @param {SchemaElement[]} schemaPath
|
|
|
|
|
* @returns {number} max repetition level
|
|
|
|
|
*/
|
|
|
|
|
function getMaxRepetitionLevel(schemaPath) {
|
|
|
|
|
let maxLevel = 0
|
|
|
|
|
for (const element of schemaPath) {
|
|
|
|
|
if (element.repetition_type === 'REPEATED') {
|
|
|
|
|
maxLevel++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return maxLevel
|
|
|
|
|
}
|
2025-03-26 04:06:43 +00:00
|
|
|
|
2025-03-26 05:36:06 +00:00
|
|
|
/**
|
|
|
|
|
* Get the max definition level for a given schema path.
|
|
|
|
|
*
|
|
|
|
|
* @param {SchemaElement[]} schemaPath
|
|
|
|
|
* @returns {number} max definition level
|
|
|
|
|
*/
|
|
|
|
|
function getMaxDefinitionLevel(schemaPath) {
|
|
|
|
|
let maxLevel = 0
|
|
|
|
|
for (const element of schemaPath.slice(1)) {
|
|
|
|
|
if (element.repetition_type !== 'REQUIRED') {
|
|
|
|
|
maxLevel++
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return maxLevel
|
2025-03-26 04:06:43 +00:00
|
|
|
}
|