import { getParquetTypeForValues, writeColumn } from './column.js'
import { Writer } from './writer.js'
import { writeMetadata } from './metadata.js'

/**
 * Write data as parquet to an ArrayBuffer.
 *
 * @import {ColumnChunk, DecodedArray, FileMetaData, SchemaElement, SchemaTree} from 'hyparquet'
 * @param {Record<string, DecodedArray>} columnData map of column names to column values
 * @returns {ArrayBuffer} parquet file contents
 */
export function parquetWrite(columnData) {
  const writer = new Writer()

  // Check that all columns have the same length
  const columnNames = Object.keys(columnData)
  const num_rows = columnNames.length ? BigInt(columnData[columnNames[0]].length) : 0n
  for (const name of columnNames) {
    if (BigInt(columnData[name].length) !== num_rows) {
      throw new Error('parquetWrite: all columns must have the same length')
    }
  }

  // Write header PAR1
  writer.appendUint32(0x31524150) // "PAR1" magic bytes (assuming appendUint32 writes little-endian)
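
  // Column chunk data follows the header immediately; the schema and row
  // group metadata are accumulated below and written in the footer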

  // schema
  /** @type {SchemaElement[]} */
  const schema = [{
    name: 'root',
    num_children: columnNames.length,
  }]

  // row group columns
  /** @type {ColumnChunk[]} */
  const columns = []

  // Write columns
  for (const name of columnNames) {
    const values = columnData[name]
    const { type, repetition_type } = getParquetTypeForValues(values)
    if (!type) throw new Error(`parquetWrite: cannot determine type for empty column ${name}`)
    // Remember where this column chunk starts in the file
    const file_offset = BigInt(writer.offset)
    /** @type {SchemaElement[]} */
    const schemaElements = [
      schema[0],
      { type, name, repetition_type, num_children: 0 },
    ]
    const meta_data = writeColumn(writer, schemaElements, values, type)

    // Save schema element and column chunk metadata for the footer
    schema.push({ type, name, repetition_type })
    columns.push({
      file_path: name,
      file_offset,
      meta_data,
    })
  }
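
  // All column chunks above were written contiguously; they are recorded
  // below as a single row group covering every row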

  // Write metadata
  /** @type {FileMetaData} */
  const metadata = {
    version: 2,
    created_by: 'hyparquet',
    schema,
    num_rows,
    row_groups: [{
      columns,
      total_byte_size: BigInt(writer.offset - 4), // all column chunk bytes after the 4-byte header magic
      num_rows,
    }],
    metadata_length: 0,
  }
  // @ts-ignore don't want to actually serialize metadata_length
  delete metadata.metadata_length
  writeMetadata(writer, metadata)
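
  // A parquet footer is the serialized FileMetaData, its 4-byte length, and
  // the "PAR1" magic (the length is presumed to be appended by writeMetadata)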

  // Write footer PAR1
  writer.appendUint32(0x31524150)

  return writer.getBuffer()
}
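
// Example usage (a minimal sketch, not part of this module; assumes
// hyparquet's parquetRead accepts an ArrayBuffer as the file source):
//
//   import { parquetRead } from 'hyparquet'
//
//   const buffer = parquetWrite({
//     id: [1, 2, 3],
//     name: ['alice', 'bob', 'carol'],
//   })
//   await parquetRead({ file: buffer, onComplete: rows => console.log(rows) })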