hyparquet-writer/src/write.js

import { writeColumn } from './column.js'
import { Writer } from './writer.js'
import { writeMetadata } from './metadata.js'
import { getSchemaElementForValues } from './schema.js'
/**
 * Write data as parquet to an ArrayBuffer
 *
 * @import {ColumnChunk, DecodedArray, FileMetaData, RowGroup, SchemaElement, SchemaTree} from 'hyparquet'
 * @import {KeyValue} from 'hyparquet/src/types.js'
 * @import {ColumnData} from '../src/types.js'
 * @param {object} options
 * @param {ColumnData[]} options.columnData column names and values to write
 * @param {boolean} [options.compressed] compress column data (default true)
 * @param {boolean} [options.statistics] write column statistics (default true)
 * @param {number} [options.rowGroupSize] maximum number of rows per row group (default 100000)
 * @param {KeyValue[]} [options.kvMetadata] key-value metadata to include in the file footer
 * @returns {ArrayBuffer} parquet file bytes
 */
export function parquetWrite({ columnData, compressed = true, statistics = true, rowGroupSize = 100000, kvMetadata }) {
  const num_rows = columnData.length ? BigInt(columnData[0].data.length) : 0n
  const writer = new Writer()

  // construct schema
  /** @type {SchemaElement[]} */
  const schema = [{
    name: 'root',
    num_children: columnData.length,
  }]
  for (const { name, data, type } of columnData) {
    // check if all columns have the same length
    if (BigInt(data.length) !== num_rows) {
      throw new Error('columns must have the same length')
    }
    // auto-detect type
    const schemaElement = getSchemaElementForValues(name, data, type)
    if (!schemaElement.type) throw new Error(`column ${name} cannot determine type`)
    schema.push(schemaElement)
  }
  // write header PAR1
  writer.appendUint32(0x31524150)
  /** @type {RowGroup[]} */
  const row_groups = []
  for (let i = 0; i < num_rows; i += rowGroupSize) {
    const groupStart = writer.offset
    // row group columns
    /** @type {ColumnChunk[]} */
    const columns = []
    // write columns
    for (let j = 0; j < columnData.length; j++) {
      const { name, data } = columnData[j]
      const file_offset = BigInt(writer.offset)
      const schemaPath = [schema[0], schema[j + 1]]
      // write only this row group's slice of the column values
      const groupData = data.slice(i, i + rowGroupSize)
      const meta_data = writeColumn(writer, schemaPath, groupData, compressed, statistics)
      // save metadata
      columns.push({
        file_path: name,
        file_offset,
        meta_data,
      })
    }
    row_groups.push({
      columns,
      total_byte_size: BigInt(writer.offset - groupStart),
      num_rows: BigInt(Math.min(rowGroupSize, Number(num_rows) - i)),
    })
  }
  // write metadata
  /** @type {FileMetaData} */
  const metadata = {
    version: 2,
    created_by: 'hyparquet',
    schema,
    num_rows,
    row_groups,
    metadata_length: 0,
    key_value_metadata: kvMetadata,
  }
  // @ts-ignore don't want to actually serialize metadata_length
  delete metadata.metadata_length
  writeMetadata(writer, metadata)
  // write footer PAR1
  writer.appendUint32(0x31524150)
  return writer.getBuffer()
}
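
// Usage sketch (assumption, not part of this module): column types are
// auto-detected from plain JS arrays by getSchemaElementForValues, and Node's
// fs module is used to save the returned ArrayBuffer. The file name and the
// kvMetadata entry are made up for illustration.
//
//   import fs from 'node:fs'
//   import { parquetWrite } from './write.js'
//
//   const arrayBuffer = parquetWrite({
//     columnData: [
//       { name: 'id', data: [1, 2, 3] },
//       { name: 'name', data: ['alice', 'bob', 'carol'] },
//     ],
//     kvMetadata: [{ key: 'source', value: 'example' }],
//   })
//   fs.writeFileSync('demo.parquet', Buffer.from(arrayBuffer))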