From 7e064bd7b0d2a0c8c7699f2ee6469be71607707e Mon Sep 17 00:00:00 2001
From: Kenny Daniel
Date: Mon, 14 Apr 2025 23:22:55 -0700
Subject: [PATCH] Update README

---
 README.md       | 46 +++++++++++++++++++++++++++++++++++++---------
 src/column.js   | 17 ++++++++---------
 src/datapage.js |  8 +++-----
 src/write.js    |  9 ++++++++-
 4 files changed, 56 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index c8d7a4a..4d55321 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ import { parquetWriteBuffer } from 'hyparquet-writer'
 
 const arrayBuffer = parquetWriteBuffer({
   columnData: [
-    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
     { name: 'age', data: [25, 30, 35], type: 'INT32' },
   ],
 })
@@ -38,6 +38,9 @@ Note: if `type` is not provided, the type will be guessed from the data. The sup
 - `FLOAT`
 - `DOUBLE`
 - `BYTE_ARRAY`
+- `FIXED_LEN_BYTE_ARRAY`
+
+Strings are represented in parquet as type `BYTE_ARRAY`.
 
 ### Node.js Write to Local Parquet File
 
@@ -49,7 +52,7 @@ const { parquetWriteFile } = await import('hyparquet-writer')
 parquetWriteFile({
   filename: 'example.parquet',
   columnData: [
-    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
     { name: 'age', data: [25, 30, 35], type: 'INT32' },
   ],
 })
@@ -62,7 +65,7 @@ Note: hyparquet-writer is published as an ES module, so dynamic `import()` may b
 Options can be passed to `parquetWrite` to adjust parquet file writing behavior:
 
  - `writer`: a generic writer object
- - `compression`: use snappy compression (default true)
+ - `compressed`: use snappy compression (default true)
 - `statistics`: write column statistics (default true)
 - `rowGroupSize`: number of rows in each row group (default 100000)
 - `kvMetadata`: extra key-value metadata to be stored in the parquet footer
@@ -74,19 +77,44 @@ const writer = new ByteWriter()
 const arrayBuffer = parquetWrite({
   writer,
   columnData: [
-    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
     { name: 'age', data: [25, 30, 35], type: 'INT32' },
   ],
-  compression: false,
+  compressed: false,
   statistics: false,
   rowGroupSize: 1000,
-  kvMetadata: {
-    'key1': 'value1',
-    'key2': 'value2',
-  },
+  kvMetadata: [
+    { key: 'key1', value: 'value1' },
+    { key: 'key2', value: 'value2' },
+  ],
 })
 ```
 
+### Converted Types
+
+You can provide additional type hints by adding a `converted_type` to the `columnData` elements:
+
+```javascript
+parquetWrite({
+  columnData: [
+    {
+      name: 'dates',
+      data: [new Date(1000000), new Date(2000000)],
+      type: 'INT64',
+      converted_type: 'TIMESTAMP_MILLIS',
+    },
+    {
+      name: 'json',
+      data: [{ foo: 'bar' }, { baz: 3 }, 'imastring'],
+      type: 'BYTE_ARRAY',
+      converted_type: 'JSON',
+    },
+  ]
+})
+```
+
+Most converted types will be auto-detected if you just provide data with no types. However, it is still recommended that you provide type information when possible (with zero rows type guessing throws an exception, floats might be typed as int, etc.).
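+
+For example, here is a minimal sketch that relies entirely on type guessing; the guessed types noted in the comments are what the rules above suggest, not a guarantee:
+
+```javascript
+import { parquetWriteBuffer } from 'hyparquet-writer'
+
+// no `type` is given: strings should be guessed as BYTE_ARRAY,
+// fractional numbers as DOUBLE, and Date objects as timestamps
+const buffer = parquetWriteBuffer({
+  columnData: [
+    { name: 'name', data: ['Alice', 'Bob'] },
+    { name: 'score', data: [1.5, 2.5] },
+    { name: 'updated', data: [new Date(1000000), new Date(2000000)] },
+  ],
+})
+```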
+
 ## References
 
 - https://github.com/hyparam/hyparquet
diff --git a/src/column.js b/src/column.js
index 3ffd884..2388f26 100644
--- a/src/column.js
+++ b/src/column.js
@@ -18,6 +18,8 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
   if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`)
   const offsetStart = writer.offset
   const num_values = values.length
+  /** @type {Encoding[]} */
+  const encodings = []
 
   // Compute statistics
   const statistics = stats ? getStatistics(values) : undefined
@@ -45,20 +47,19 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
     // write data page with dictionary indexes
     data_page_offset = BigInt(writer.offset)
     writeDataPageV2(writer, indexes, type, schemaPath, 'RLE_DICTIONARY', compressed)
+    encodings.push('RLE_DICTIONARY')
   } else {
     // unconvert values from rich types to simple
     values = unconvert(schemaElement, values)
 
     // write data page
     writeDataPageV2(writer, values, type, schemaPath, 'PLAIN', compressed)
+    encodings.push('PLAIN')
   }
 
-  /** @type {import('hyparquet').Encoding} */
-  const encoding = dictionary ? 'RLE_DICTIONARY' : 'PLAIN'
-
   return {
     type,
-    encodings: [encoding],
+    encodings,
     path_in_schema: schemaPath.slice(1).map(s => s.name),
     codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
     num_values: BigInt(num_values),
@@ -106,8 +107,7 @@ function writeDictionaryPage(writer, dictionary, type, compressed) {
   }
 
   // write dictionary page header
-  /** @type {PageHeader} */
-  const dictionaryHeader = {
+  writePageHeader(writer, {
     type: 'DICTIONARY_PAGE',
     uncompressed_page_size: dictionaryPage.offset,
     compressed_page_size: compressedDictionaryPage.offset,
     dictionary_page_header: {
       num_values: dictionary.length,
       encoding: 'PLAIN',
     },
-  }
-  writePageHeader(writer, dictionaryHeader)
+  })
   writer.appendBuffer(compressedDictionaryPage.getBuffer())
 }
 
 /**
- * @import {ColumnMetaData, DecodedArray, PageHeader, ParquetType, SchemaElement, Statistics} from 'hyparquet'
+ * @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
  * @import {Writer} from '../src/types.js'
  * @param {DecodedArray} values
  * @returns {Statistics}
diff --git a/src/datapage.js b/src/datapage.js
index bfc9b90..0d3b996 100644
--- a/src/datapage.js
+++ b/src/datapage.js
@@ -44,8 +44,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
   }
 
   // write page header
-  /** @type {PageHeader} */
-  const header = {
+  writePageHeader(writer, {
     type: 'DATA_PAGE_V2',
     uncompressed_page_size: levels.offset + page.offset,
     compressed_page_size: levels.offset + compressedPage.offset,
@@ -58,8 +57,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
       repetition_levels_byte_length,
       is_compressed: compressed,
     },
-  }
-  writePageHeader(writer, header)
+  })
 
   // write levels
   writer.appendBuffer(levels.getBuffer())
@@ -69,7 +67,6 @@
 }
 
 /**
- * @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
  * @param {Writer} writer
  * @param {PageHeader} header
  */
@@ -105,6 +102,7 @@ export function writePageHeader(writer, header) {
 }
 
 /**
+ * @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
  * @param {Writer} writer
  * @param {SchemaElement[]} schemaPath
  * @param {DecodedArray} values
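As a quick sanity check of the `encodings` bookkeeping introduced in src/column.js above, you can write a small buffer and parse the footer back with hyparquet's `parquetMetadata`. This is a sketch, not part of the patch; whether the writer actually emits a dictionary page for the repeated values below is up to its own heuristics:

```javascript
import { parquetMetadata } from 'hyparquet'
import { parquetWriteBuffer } from 'hyparquet-writer'

// write a tiny single-column file, then inspect its footer metadata
const buffer = parquetWriteBuffer({
  columnData: [{ name: 'id', data: [1, 1, 2, 2], type: 'INT32' }],
})
const metadata = parquetMetadata(buffer)

// with this patch, `encodings` is accumulated as pages are written:
// ['RLE_DICTIONARY'] if a dictionary page was used, ['PLAIN'] otherwise
console.log(metadata.row_groups[0].columns[0].meta_data?.encodings)
```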
diff --git a/src/write.js b/src/write.js
index 21f2e3b..663fd69 100644
--- a/src/write.js
+++ b/src/write.js
@@ -9,7 +9,14 @@ import { schemaFromColumnData } from './schema.js'
  * @import {ParquetWriteOptions} from '../src/types.js'
  * @param {ParquetWriteOptions} options
  */
-export function parquetWrite({ writer, columnData, compressed = true, statistics = true, rowGroupSize = 100000, kvMetadata }) {
+export function parquetWrite({
+  writer,
+  columnData,
+  compressed = true,
+  statistics = true,
+  rowGroupSize = 100000,
+  kvMetadata,
+}) {
   const schema = schemaFromColumnData(columnData)
   const pq = new ParquetWriter({
     writer,