hyparquet-writer/src/write.js

62 lines
1.7 KiB
JavaScript
Raw Normal View History

import { getSchemaElementForValues } from './schema.js'
2025-04-07 08:02:21 +00:00
import { ParquetWriter } from './parquet-writer.js'
2025-03-26 04:06:43 +00:00
/**
 * Write data as parquet to an ArrayBuffer.
 *
 * Derives the schema from the column data, hands it to a ParquetWriter,
 * writes every column in one call and returns whatever the writer's
 * finish() produces.
 *
 * @import {ColumnChunk, DecodedArray, FileMetaData, RowGroup, SchemaElement, SchemaTree} from 'hyparquet'
 * @import {KeyValue} from 'hyparquet/src/types.js'
 * @import {ColumnData} from '../src/types.js'
 * @param {object} options
 * @param {ColumnData[]} options.columnData columns to write
 * @param {boolean} [options.compressed] forwarded to ParquetWriter, defaults to true
 * @param {boolean} [options.statistics] forwarded to ParquetWriter, defaults to true
 * @param {number} [options.rowGroupSize] forwarded to writer.write, defaults to 100000
 * @param {KeyValue[]} [options.kvMetadata] forwarded to ParquetWriter
 * @returns {ArrayBuffer}
 */
export function parquetWrite({ columnData, compressed = true, statistics = true, rowGroupSize = 100000, kvMetadata }) {
  // the schema must exist before the writer is constructed
  const schema = schemaFromColumnData(columnData)
  const writer = new ParquetWriter({
    schema,
    compressed,
    statistics,
    kvMetadata,
  })
  writer.write({
    columnData,
    rowGroupSize,
  })
  return writer.finish()
}
/**
 * Convert column data to schema.
 *
 * The returned array starts with a 'root' element whose num_children is the
 * number of columns, followed by one element per column in input order, as
 * produced by getSchemaElementForValues.
 *
 * @param {ColumnData[]} columnData
 * @returns {SchemaElement[]}
 * @throws {Error} if the columns do not all have the same length
 * @throws {Error} if a column's schema element has no type
 */
function schemaFromColumnData(columnData) {
  /** @type {SchemaElement[]} */
  const schema = [{
    name: 'root',
    num_children: columnData.length,
  }]
  // undefined (not 0) marks "no column seen yet", so an empty first column
  // still fixes the expected length and cannot mask a later mismatch
  /** @type {number | undefined} */
  let num_rows
  for (const { name, data, type } of columnData) {
    // check if all columns have the same length
    if (num_rows === undefined) {
      num_rows = data.length
    } else if (num_rows !== data.length) {
      throw new Error('columns must have the same length')
    }
    // auto-detect type
    const schemaElement = getSchemaElementForValues(name, data, type)
    if (!schemaElement.type) throw new Error(`column ${name} cannot determine type`)
    schema.push(schemaElement)
  }
  return schema
}