Update README

Kenny Daniel 2025-04-14 23:22:55 -07:00
parent 26daec2fcb
commit 7e064bd7b0
No known key found for this signature in database
GPG Key ID: FDF16101AF5AFD3A
4 changed files with 56 additions and 24 deletions

@@ -24,7 +24,7 @@ import { parquetWriteBuffer } from 'hyparquet-writer'
const arrayBuffer = parquetWriteBuffer({
columnData: [
-{ name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+{ name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
{ name: 'age', data: [25, 30, 35], type: 'INT32' },
],
})
@@ -38,6 +38,9 @@ Note: if `type` is not provided, the type will be guessed from the data. The sup
- `FLOAT`
- `DOUBLE`
- `BYTE_ARRAY`
+- `FIXED_LEN_BYTE_ARRAY`
+Strings are represented in parquet as type `BYTE_ARRAY`.
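Since `type` is optional, the same example can also be written with no explicit types and the writer will guess them from the data. A minimal sketch (the exact physical types chosen are up to the writer):

```javascript
import { parquetWriteBuffer } from 'hyparquet-writer'

// no `type` given: the strings should be guessed as BYTE_ARRAY,
// and the numbers as a numeric parquet type
const buffer = parquetWriteBuffer({
  columnData: [
    { name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
    { name: 'age', data: [25, 30, 35] },
  ],
})
```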
### Node.js Write to Local Parquet File
@@ -49,7 +52,7 @@ const { parquetWriteFile } = await import('hyparquet-writer')
parquetWriteFile({
filename: 'example.parquet',
columnData: [
-{ name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+{ name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
{ name: 'age', data: [25, 30, 35], type: 'INT32' },
],
})
@@ -62,7 +65,7 @@ Note: hyparquet-writer is published as an ES module, so dynamic `import()` may b
Options can be passed to `parquetWrite` to adjust parquet file writing behavior:
- `writer`: a generic writer object
-- `compression`: use snappy compression (default true)
+- `compressed`: use snappy compression (default true)
- `statistics`: write column statistics (default true)
- `rowGroupSize`: number of rows in each row group (default 100000)
- `kvMetadata`: extra key-value metadata to be stored in the parquet footer
@@ -74,19 +77,44 @@ const writer = new ByteWriter()
const arrayBuffer = parquetWrite({
writer,
columnData: [
-{ name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+{ name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
{ name: 'age', data: [25, 30, 35], type: 'INT32' },
],
-compression: false,
+compressed: false,
statistics: false,
rowGroupSize: 1000,
-kvMetadata: {
-'key1': 'value1',
-'key2': 'value2',
-},
+kvMetadata: [
+{ key: 'key1', value: 'value1' },
+{ key: 'key2', value: 'value2' },
+],
})
```
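As a quick sanity check, the written bytes can be read back with the companion hyparquet reader. This is only a sketch: it assumes `writer.getBuffer()` exposes the accumulated `ArrayBuffer` (the writer uses it that way internally) and that hyparquet's `parquetReadObjects` accepts that buffer as `file`:

```javascript
import { parquetReadObjects } from 'hyparquet'

// read the freshly written parquet buffer back into plain JS objects
const rows = await parquetReadObjects({ file: writer.getBuffer() })
console.log(rows) // expected: [{ name: 'Alice', age: 25 }, ...]
```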
+### Converted Types
+You can provide additional type hints by adding a `converted_type` to `columnData` elements:
+```javascript
+parquetWrite({
+columnData: [
+{
+name: 'dates',
+data: [new Date(1000000), new Date(2000000)],
+type: 'INT64',
+converted_type: 'TIMESTAMP_MILLIS',
+},
+{
+name: 'json',
+data: [{ foo: 'bar' }, { baz: 3 }, 'imastring'],
+type: 'BYTE_ARRAY',
+converted_type: 'JSON',
+},
+]
+})
+```
+Most converted types will be auto-detected if you just provide data with no types. However, it is still recommended that you provide type information when possible (with zero rows there is nothing to infer from and an exception is thrown; floats whose values are whole numbers may be typed as int; etc.).
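As an illustration of that auto-detection (hedged: the exact inferred types are the writer's choice, not guaranteed by this sketch), the columns above could be written with no type information at all:

```javascript
parquetWrite({
  columnData: [
    // plausibly detected as INT64 with converted type TIMESTAMP_MILLIS
    { name: 'dates', data: [new Date(1000000), new Date(2000000)] },
    // plausibly detected as BYTE_ARRAY with converted type JSON
    { name: 'json', data: [{ foo: 'bar' }, { baz: 3 }] },
  ],
})
```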
## References
- https://github.com/hyparam/hyparquet

@@ -18,6 +18,8 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`)
const offsetStart = writer.offset
const num_values = values.length
+/** @type {Encoding[]} */
+const encodings = []
// Compute statistics
const statistics = stats ? getStatistics(values) : undefined
@@ -45,20 +47,19 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
// write data page with dictionary indexes
data_page_offset = BigInt(writer.offset)
writeDataPageV2(writer, indexes, type, schemaPath, 'RLE_DICTIONARY', compressed)
+encodings.push('RLE_DICTIONARY')
} else {
// unconvert values from rich types to simple
values = unconvert(schemaElement, values)
// write data page
writeDataPageV2(writer, values, type, schemaPath, 'PLAIN', compressed)
+encodings.push('PLAIN')
}
-/** @type {import('hyparquet').Encoding} */
-const encoding = dictionary ? 'RLE_DICTIONARY' : 'PLAIN'
return {
type,
-encodings: [encoding],
+encodings,
path_in_schema: schemaPath.slice(1).map(s => s.name),
codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
num_values: BigInt(num_values),
@@ -106,8 +107,7 @@ function writeDictionaryPage(writer, dictionary, type, compressed) {
}
// write dictionary page header
-/** @type {PageHeader} */
-const dictionaryHeader = {
+writePageHeader(writer, {
type: 'DICTIONARY_PAGE',
uncompressed_page_size: dictionaryPage.offset,
compressed_page_size: compressedDictionaryPage.offset,
@@ -115,13 +115,12 @@ function writeDictionaryPage(writer, dictionary, type, compressed) {
num_values: dictionary.length,
encoding: 'PLAIN',
},
-}
-writePageHeader(writer, dictionaryHeader)
+})
writer.appendBuffer(compressedDictionaryPage.getBuffer())
}
/**
-* @import {ColumnMetaData, DecodedArray, PageHeader, ParquetType, SchemaElement, Statistics} from 'hyparquet'
+* @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
* @import {Writer} from '../src/types.js'
* @param {DecodedArray} values
* @returns {Statistics}

@@ -44,8 +44,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
}
// write page header
-/** @type {PageHeader} */
-const header = {
+writePageHeader(writer, {
type: 'DATA_PAGE_V2',
uncompressed_page_size: levels.offset + page.offset,
compressed_page_size: levels.offset + compressedPage.offset,
@@ -58,8 +57,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
repetition_levels_byte_length,
is_compressed: compressed,
},
-}
-writePageHeader(writer, header)
+})
// write levels
writer.appendBuffer(levels.getBuffer())
@@ -69,7 +67,6 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
}
/**
-* @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
* @param {Writer} writer
* @param {PageHeader} header
*/
@@ -105,6 +102,7 @@ export function writePageHeader(writer, header) {
}
/**
+* @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
* @param {Writer} writer
* @param {SchemaElement[]} schemaPath
* @param {DecodedArray} values

@@ -9,7 +9,14 @@ import { schemaFromColumnData } from './schema.js'
* @import {ParquetWriteOptions} from '../src/types.js'
* @param {ParquetWriteOptions} options
*/
-export function parquetWrite({ writer, columnData, compressed = true, statistics = true, rowGroupSize = 100000, kvMetadata }) {
+export function parquetWrite({
+writer,
+columnData,
+compressed = true,
+statistics = true,
+rowGroupSize = 100000,
+kvMetadata,
+}) {
const schema = schemaFromColumnData(columnData)
const pq = new ParquetWriter({
writer,