import { writeColumn } from './column.js'
import { Writer } from './writer.js'
import { writeMetadata } from './metadata.js'
import { getSchemaElementForValues } from './schema.js'
/**
 * Write data as parquet to an ArrayBuffer.
 *
 * Layout written: PAR1 magic, then one row group per `rowGroupSize` rows
 * (columns written group-by-group), then the Thrift file metadata, then the
 * trailing PAR1 magic.
 *
 * @import {ColumnChunk, DecodedArray, FileMetaData, RowGroup, SchemaElement, SchemaTree} from 'hyparquet'
 * @import {KeyValue} from 'hyparquet/src/types.js'
 * @import {ColumnData} from '../src/types.js'
 * @param {object} options
 * @param {ColumnData[]} options.columnData column names, values, and optional explicit types
 * @param {boolean} [options.compressed] compress column pages (default true)
 * @param {boolean} [options.statistics] write column statistics (default true)
 * @param {number} [options.rowGroupSize] max rows per row group (default 100000)
 * @param {KeyValue[]} [options.kvMetadata] optional key-value metadata
 * @returns {ArrayBuffer} complete parquet file contents
 */
export function parquetWrite({ columnData, compressed = true, statistics = true, rowGroupSize = 100000, kvMetadata }) {
  const num_rows = columnData.length ? BigInt(columnData[0].data.length) : 0n
  const writer = new Writer()

  // construct schema: root element followed by one leaf per column
  /** @type {SchemaElement[]} */
  const schema = [{
    name: 'root',
    num_children: columnData.length,
  }]
  for (const { name, data, type } of columnData) {
    // check if all columns have the same length
    if (BigInt(data.length) !== num_rows) {
      throw new Error('columns must have the same length')
    }
    // auto-detect type when not given explicitly
    const schemaElement = getSchemaElementForValues(name, data, type)
    if (!schemaElement.type) throw new Error(`column ${name} cannot determine type`)
    schema.push(schemaElement)
  }

  // write header PAR1
  writer.appendUint32(0x31524150)

  /** @type {RowGroup[]} */
  const row_groups = []
  for (let groupStartIndex = 0; groupStartIndex < Number(num_rows); groupStartIndex += rowGroupSize) {
    const groupStartOffset = writer.offset
    // end of this row group's row range (exclusive), clamped to the table size
    const groupEndIndex = Math.min(groupStartIndex + rowGroupSize, Number(num_rows))

    // row group columns
    /** @type {ColumnChunk[]} */
    const columns = []

    // write columns
    for (let columnIndex = 0; columnIndex < columnData.length; columnIndex++) {
      const { name, data } = columnData[columnIndex]
      const file_offset = BigInt(writer.offset)
      // schema path is root + this column's leaf element
      const schemaPath = [schema[0], schema[columnIndex + 1]]
      // BUGFIX: write only this row group's slice of the column.
      // Previously the full column was passed to writeColumn for every group,
      // duplicating all rows in each row group whenever num_rows > rowGroupSize,
      // contradicting the per-group num_rows recorded below.
      const groupData = data.slice(groupStartIndex, groupEndIndex)
      const meta_data = writeColumn(writer, schemaPath, groupData, compressed, statistics)

      // save column chunk metadata for the file footer
      columns.push({
        file_path: name,
        file_offset,
        meta_data,
      })
    }

    row_groups.push({
      columns,
      total_byte_size: BigInt(writer.offset - groupStartOffset),
      num_rows: BigInt(groupEndIndex - groupStartIndex),
    })
  }

  // write metadata
  /** @type {FileMetaData} */
  const metadata = {
    version: 2,
    created_by: 'hyparquet',
    schema,
    num_rows,
    row_groups,
    metadata_length: 0,
    key_value_metadata: kvMetadata,
  }
  // @ts-ignore don't want to actually serialize metadata_length
  delete metadata.metadata_length
  writeMetadata(writer, metadata)

  // write footer PAR1
  writer.appendUint32(0x31524150)

  return writer.getBuffer()
}
|