// hyparquet-writer/src/write.js

import { writeColumn } from './column.js'
import { Writer } from './writer.js'
import { writeMetadata } from './metadata.js'
import { getSchemaElementForValues } from './schema.js'
/**
 * Write data as parquet to an ArrayBuffer.
 *
 * All columns are written into a single row group. The output is, in order:
 * the "PAR1" magic, one column chunk per entry of `columnData` (via
 * `writeColumn`), the file metadata (via `writeMetadata`), and a closing
 * "PAR1" magic.
 *
 * @import {ColumnChunk, DecodedArray, FileMetaData, SchemaElement, SchemaTree} from 'hyparquet'
 * @import {ColumnData} from '../src/types.js'
 * @param {object} options
 * @param {ColumnData[]} options.columnData columns to write; each entry provides `name`, `data` and `type`
 * @param {boolean} [options.compressed] forwarded to `writeColumn` (default true)
 * @returns {ArrayBuffer} the buffer returned by the writer after the footer magic is appended
 * @throws {Error} 'columns must have the same length' if any column's data length differs from the first
 * @throws {Error} if the schema element resolved for a column has no `type`
 */
export function parquetWrite({ columnData, compressed = true }) {
  const writer = new Writer()

  // Check if all columns have the same length.
  // An empty columnData yields zero rows.
  const num_rows = columnData.length ? BigInt(columnData[0].data.length) : 0n
  for (const { data } of columnData) {
    if (BigInt(data.length) !== num_rows) {
      throw new Error('columns must have the same length')
    }
  }

  // Write header PAR1
  // (0x31524150 spells "PAR1" when stored little-endian; assumes
  // appendUint32 is little-endian — confirm in writer.js)
  writer.appendUint32(0x31524150)

  // schema: starts with the root element, one child is pushed per column below
  /** @type {SchemaElement[]} */
  const schema = [{
    name: 'root',
    num_children: columnData.length,
  }]

  // row group columns
  /** @type {ColumnChunk[]} */
  const columns = []

  // Write columns
  for (const { name, data, type } of columnData) {
    // auto-detect type
    const schemaElement = getSchemaElementForValues(name, data, type)
    if (!schemaElement.type) throw new Error(`column ${name} cannot determine type`)

    // Offset must be captured before writeColumn advances the writer
    const file_offset = BigInt(writer.offset)

    // Path from the root element down to this (flat) column
    /** @type {SchemaElement[]} */
    const schemaPath = [
      schema[0],
      schemaElement,
    ]
    const meta_data = writeColumn(writer, schemaPath, data, compressed)

    // save metadata
    schema.push(schemaElement)
    columns.push({
      file_path: name,
      file_offset,
      meta_data,
    })
  }

  // Write metadata
  /** @type {FileMetaData} */
  const metadata = {
    version: 2,
    created_by: 'hyparquet',
    schema,
    num_rows,
    row_groups: [{
      columns,
      // bytes written so far minus the 4-byte header magic
      total_byte_size: BigInt(writer.offset - 4),
      num_rows,
    }],
    // present only so the literal matches the FileMetaData type; removed below
    metadata_length: 0,
  }
  // @ts-ignore don't want to actually serialize metadata_length
  delete metadata.metadata_length
  writeMetadata(writer, metadata)

  // Write footer PAR1
  writer.appendUint32(0x31524150)

  return writer.getBuffer()
}