Mirror of https://github.com/asadbek064/hyparquet-writer.git (synced 2025-12-05 23:31:54 +00:00)

Update README

parent 26daec2fcb
commit 7e064bd7b0

README.md (46 changed lines)
@@ -24,7 +24,7 @@ import { parquetWriteBuffer } from 'hyparquet-writer'
 
 const arrayBuffer = parquetWriteBuffer({
   columnData: [
-    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
     { name: 'age', data: [25, 30, 35], type: 'INT32' },
   ],
 })
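The buffer example now uses the parquet physical type `BYTE_ARRAY` for the string column. As a quick round-trip check (not part of this commit), the buffer can be read back with hyparquet; this is a minimal sketch assuming hyparquet's `parquetReadObjects` accepts an ArrayBuffer as its `file` argument:

```javascript
import { parquetWriteBuffer } from 'hyparquet-writer'
import { parquetReadObjects } from 'hyparquet' // assumed reader API from the companion library

const arrayBuffer = parquetWriteBuffer({
  columnData: [
    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
    { name: 'age', data: [25, 30, 35], type: 'INT32' },
  ],
})

// read the rows back to verify the round trip
const rows = await parquetReadObjects({ file: arrayBuffer })
console.log(rows) // expected shape: [{ name: 'Alice', age: 25 }, ...]
```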
@@ -38,6 +38,9 @@ Note: if `type` is not provided, the type will be guessed from the data. The sup
 - `FLOAT`
 - `DOUBLE`
 - `BYTE_ARRAY`
+- `FIXED_LEN_BYTE_ARRAY`
+
+Strings are represented in parquet as type `BYTE_ARRAY`.
 
 ### Node.js Write to Local Parquet File
 
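Given the note above that an omitted `type` is guessed from the data, and that strings land in parquet as `BYTE_ARRAY`, a minimal sketch of relying on auto-detection might look like this (hypothetical column names, not taken from the diff):

```javascript
import { parquetWriteBuffer } from 'hyparquet-writer'

// no `type` given: per the README note, types are guessed from the data,
// and the string column is stored as parquet BYTE_ARRAY
const buffer = parquetWriteBuffer({
  columnData: [
    { name: 'city', data: ['Tashkent', 'Oslo', 'Lima'] },
    { name: 'population', data: [3000000, 700000, 11000000] },
  ],
})
```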
@@ -49,7 +52,7 @@ const { parquetWriteFile } = await import('hyparquet-writer')
 
 parquetWriteFile({
   filename: 'example.parquet',
   columnData: [
-    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
     { name: 'age', data: [25, 30, 35], type: 'INT32' },
   ],
 })
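Once the file has been written, a possible sanity check (plain Node.js, not part of this commit) is to confirm it exists and is non-empty:

```javascript
import { statSync } from 'node:fs'

// 'example.parquet' is the filename used in the README snippet above
const { size } = statSync('example.parquet')
console.log(`wrote example.parquet (${size} bytes)`)
```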
@@ -62,7 +65,7 @@ Note: hyparquet-writer is published as an ES module, so dynamic `import()` may b
 Options can be passed to `parquetWrite` to adjust parquet file writing behavior:
 
 - `writer`: a generic writer object
-- `compression`: use snappy compression (default true)
+- `compressed`: use snappy compression (default true)
 - `statistics`: write column statistics (default true)
 - `rowGroupSize`: number of rows in each row group (default 100000)
 - `kvMetadata`: extra key-value metadata to be stored in the parquet footer
@@ -74,19 +77,44 @@ const writer = new ByteWriter()
 const arrayBuffer = parquetWrite({
   writer,
   columnData: [
-    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
     { name: 'age', data: [25, 30, 35], type: 'INT32' },
   ],
-  compression: false,
+  compressed: false,
   statistics: false,
   rowGroupSize: 1000,
-  kvMetadata: {
-    'key1': 'value1',
-    'key2': 'value2',
-  },
+  kvMetadata: [
+    { key: 'key1', value: 'value1' },
+    { key: 'key2', value: 'value2' },
+  ],
 })
 ```
 
+### Converted Types
+
+You can provide additional type hints by providing a `converted_type` to the `columnData` elements:
+
+```javascript
+parquetWrite({
+  columnData: [
+    {
+      name: 'dates',
+      data: [new Date(1000000), new Date(2000000)],
+      type: 'INT64',
+      converted_type: 'TIMESTAMP_MILLIS',
+    },
+    {
+      name: 'json',
+      data: [{ foo: 'bar' }, { baz: 3 }, 'imastring'],
+      type: 'BYTE_ARRAY',
+      converted_type: 'JSON',
+    },
+  ]
+})
+```
+
+Most converted types will be auto-detected if you just provide data with no types. However, it is still recommended that you provide type information when possible. (zero rows would throw an exception, floats might be typed as int, etc)
+
 ## References
 
 - https://github.com/hyparam/hyparquet
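The caveat in the added text above (floats might be typed as int when the values happen to be whole numbers) can be avoided by passing an explicit type. A minimal sketch, assuming a `writer` set up as in the `ByteWriter` example earlier in the README:

```javascript
// these values are all whole numbers, so type guessing could pick an integer
// type; an explicit DOUBLE keeps the column floating point
parquetWrite({
  writer,
  columnData: [
    { name: 'ratio', data: [1, 2, 4], type: 'DOUBLE' },
  ],
})
```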
@@ -18,6 +18,8 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
   if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`)
   const offsetStart = writer.offset
   const num_values = values.length
+  /** @type {Encoding[]} */
+  const encodings = []
 
   // Compute statistics
   const statistics = stats ? getStatistics(values) : undefined
@@ -45,20 +47,19 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
     // write data page with dictionary indexes
     data_page_offset = BigInt(writer.offset)
     writeDataPageV2(writer, indexes, type, schemaPath, 'RLE_DICTIONARY', compressed)
+    encodings.push('RLE_DICTIONARY')
   } else {
     // unconvert values from rich types to simple
     values = unconvert(schemaElement, values)
 
     // write data page
     writeDataPageV2(writer, values, type, schemaPath, 'PLAIN', compressed)
+    encodings.push('PLAIN')
   }
 
-  /** @type {import('hyparquet').Encoding} */
-  const encoding = dictionary ? 'RLE_DICTIONARY' : 'PLAIN'
-
   return {
     type,
-    encodings: [encoding],
+    encodings,
     path_in_schema: schemaPath.slice(1).map(s => s.name),
     codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
     num_values: BigInt(num_values),
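The two hunks above replace the single `encoding` value, previously derived at the end from whether a dictionary was used, with an `encodings` array that is pushed to as each page is written, so the returned column chunk metadata lists exactly the encodings that were emitted. A rough sketch of the pattern (hypothetical names, not the library's exact code):

```javascript
// accumulate encodings as pages are written instead of deriving one value later
function writePages(useDictionary) {
  const encodings = []
  if (useDictionary) {
    // dictionary-indexed data page
    encodings.push('RLE_DICTIONARY')
  } else {
    // plain data page
    encodings.push('PLAIN')
  }
  // the column chunk metadata then reports exactly what was written
  return { encodings }
}

console.log(writePages(true))  // { encodings: ['RLE_DICTIONARY'] }
console.log(writePages(false)) // { encodings: ['PLAIN'] }
```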
@@ -106,8 +107,7 @@ function writeDictionaryPage(writer, dictionary, type, compressed) {
   }
 
   // write dictionary page header
-  /** @type {PageHeader} */
-  const dictionaryHeader = {
+  writePageHeader(writer, {
     type: 'DICTIONARY_PAGE',
     uncompressed_page_size: dictionaryPage.offset,
     compressed_page_size: compressedDictionaryPage.offset,
@@ -115,13 +115,12 @@ function writeDictionaryPage(writer, dictionary, type, compressed) {
       num_values: dictionary.length,
       encoding: 'PLAIN',
     },
-  }
-  writePageHeader(writer, dictionaryHeader)
+  })
   writer.appendBuffer(compressedDictionaryPage.getBuffer())
 }
 
 /**
- * @import {ColumnMetaData, DecodedArray, PageHeader, ParquetType, SchemaElement, Statistics} from 'hyparquet'
+ * @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
  * @import {Writer} from '../src/types.js'
  * @param {DecodedArray} values
  * @returns {Statistics}
@@ -44,8 +44,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
   }
 
   // write page header
-  /** @type {PageHeader} */
-  const header = {
+  writePageHeader(writer, {
     type: 'DATA_PAGE_V2',
     uncompressed_page_size: levels.offset + page.offset,
     compressed_page_size: levels.offset + compressedPage.offset,
@@ -58,8 +57,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
       repetition_levels_byte_length,
       is_compressed: compressed,
     },
-  }
-  writePageHeader(writer, header)
+  })
 
   // write levels
   writer.appendBuffer(levels.getBuffer())
@@ -69,7 +67,6 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
 }
 
 /**
- * @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
  * @param {Writer} writer
  * @param {PageHeader} header
  */
@@ -105,6 +102,7 @@ export function writePageHeader(writer, header) {
 }
 
 /**
+ * @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
  * @param {Writer} writer
  * @param {SchemaElement[]} schemaPath
  * @param {DecodedArray} values
@@ -9,7 +9,14 @@ import { schemaFromColumnData } from './schema.js'
  * @import {ParquetWriteOptions} from '../src/types.js'
  * @param {ParquetWriteOptions} options
  */
-export function parquetWrite({ writer, columnData, compressed = true, statistics = true, rowGroupSize = 100000, kvMetadata }) {
+export function parquetWrite({
+  writer,
+  columnData,
+  compressed = true,
+  statistics = true,
+  rowGroupSize = 100000,
+  kvMetadata,
+}) {
   const schema = schemaFromColumnData(columnData)
   const pq = new ParquetWriter({
     writer,