Mirror of https://github.com/asadbek064/hyparquet-writer.git (synced 2025-12-05 23:31:54 +00:00)

Update README

parent 26daec2fcb
commit 7e064bd7b0

README.md (46 changed lines)
@@ -24,7 +24,7 @@ import { parquetWriteBuffer } from 'hyparquet-writer'
 
 const arrayBuffer = parquetWriteBuffer({
   columnData: [
-    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
     { name: 'age', data: [25, 30, 35], type: 'INT32' },
   ],
 })
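The buffer example now uses the parquet physical type `BYTE_ARRAY` for the string column. As a quick round-trip check (not part of this commit), the buffer can be read back with hyparquet; this is a minimal sketch assuming hyparquet's `parquetReadObjects` accepts an ArrayBuffer as its `file` argument:

```javascript
import { parquetWriteBuffer } from 'hyparquet-writer'
import { parquetReadObjects } from 'hyparquet' // assumed reader API from the companion library

const arrayBuffer = parquetWriteBuffer({
  columnData: [
    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
    { name: 'age', data: [25, 30, 35], type: 'INT32' },
  ],
})

// read the rows back to verify the round trip
const rows = await parquetReadObjects({ file: arrayBuffer })
console.log(rows) // expected shape: [{ name: 'Alice', age: 25 }, ...]
```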
@@ -38,6 +38,9 @@ Note: if `type` is not provided, the type will be guessed from the data. The sup
 - `FLOAT`
 - `DOUBLE`
 - `BYTE_ARRAY`
+- `FIXED_LEN_BYTE_ARRAY`
+
+Strings are represented in parquet as type `BYTE_ARRAY`.
 
 ### Node.js Write to Local Parquet File
 
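Given the note above that an omitted `type` is guessed from the data, and that strings land in parquet as `BYTE_ARRAY`, a minimal sketch of relying on auto-detection might look like this (hypothetical column names, not taken from the diff):

```javascript
import { parquetWriteBuffer } from 'hyparquet-writer'

// no `type` given: per the README note, types are guessed from the data,
// and the string column is stored as parquet BYTE_ARRAY
const buffer = parquetWriteBuffer({
  columnData: [
    { name: 'city', data: ['Tashkent', 'Oslo', 'Lima'] },
    { name: 'population', data: [3000000, 700000, 11000000] },
  ],
})
```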
@@ -49,7 +52,7 @@ const { parquetWriteFile } = await import('hyparquet-writer')
 
 parquetWriteFile({
   filename: 'example.parquet',
   columnData: [
-    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
     { name: 'age', data: [25, 30, 35], type: 'INT32' },
   ],
 })
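Once the file has been written, a possible sanity check (plain Node.js, not part of this commit) is to confirm it exists and is non-empty:

```javascript
import { statSync } from 'node:fs'

// 'example.parquet' is the filename used in the README snippet above
const { size } = statSync('example.parquet')
console.log(`wrote example.parquet (${size} bytes)`)
```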
@@ -62,7 +65,7 @@ Note: hyparquet-writer is published as an ES module, so dynamic `import()` may b
 Options can be passed to `parquetWrite` to adjust parquet file writing behavior:
 
 - `writer`: a generic writer object
-- `compression`: use snappy compression (default true)
+- `compressed`: use snappy compression (default true)
 - `statistics`: write column statistics (default true)
 - `rowGroupSize`: number of rows in each row group (default 100000)
 - `kvMetadata`: extra key-value metadata to be stored in the parquet footer
@@ -74,19 +77,44 @@ const writer = new ByteWriter()
 const arrayBuffer = parquetWrite({
   writer,
   columnData: [
-    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'STRING' },
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'], type: 'BYTE_ARRAY' },
     { name: 'age', data: [25, 30, 35], type: 'INT32' },
   ],
-  compression: false,
+  compressed: false,
   statistics: false,
   rowGroupSize: 1000,
-  kvMetadata: {
-    'key1': 'value1',
-    'key2': 'value2',
-  },
+  kvMetadata: [
+    { key: 'key1', value: 'value1' },
+    { key: 'key2', value: 'value2' },
+  ],
 })
 ```
 
+### Converted Types
+
+You can provide additional type hints by providing a `converted_type` to the `columnData` elements:
+
+```javascript
+parquetWrite({
+  columnData: [
+    {
+      name: 'dates',
+      data: [new Date(1000000), new Date(2000000)],
+      type: 'INT64',
+      converted_type: 'TIMESTAMP_MILLIS',
+    },
+    {
+      name: 'json',
+      data: [{ foo: 'bar' }, { baz: 3 }, 'imastring'],
+      type: 'BYTE_ARRAY',
+      converted_type: 'JSON',
+    },
+  ]
+})
+```
+
+Most converted types will be auto-detected if you just provide data with no types. However, it is still recommended that you provide type information when possible. (zero rows would throw an exception, floats might be typed as int, etc)
+
 ## References
 
 - https://github.com/hyparam/hyparquet
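The caveat in the added text above (floats might be typed as int when the values happen to be whole numbers) can be avoided by passing an explicit type. A minimal sketch, assuming a `writer` set up as in the `ByteWriter` example earlier in the README:

```javascript
// these values are all whole numbers, so type guessing could pick an integer
// type; an explicit DOUBLE keeps the column floating point
parquetWrite({
  writer,
  columnData: [
    { name: 'ratio', data: [1, 2, 4], type: 'DOUBLE' },
  ],
})
```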
@@ -18,6 +18,8 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
   if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`)
   const offsetStart = writer.offset
   const num_values = values.length
+  /** @type {Encoding[]} */
+  const encodings = []
 
   // Compute statistics
   const statistics = stats ? getStatistics(values) : undefined
@@ -45,20 +47,19 @@ export function writeColumn(writer, schemaPath, values, compressed, stats) {
     // write data page with dictionary indexes
     data_page_offset = BigInt(writer.offset)
     writeDataPageV2(writer, indexes, type, schemaPath, 'RLE_DICTIONARY', compressed)
+    encodings.push('RLE_DICTIONARY')
   } else {
     // unconvert values from rich types to simple
     values = unconvert(schemaElement, values)
 
     // write data page
     writeDataPageV2(writer, values, type, schemaPath, 'PLAIN', compressed)
+    encodings.push('PLAIN')
   }
 
-  /** @type {import('hyparquet').Encoding} */
-  const encoding = dictionary ? 'RLE_DICTIONARY' : 'PLAIN'
-
   return {
     type,
-    encodings: [encoding],
+    encodings,
     path_in_schema: schemaPath.slice(1).map(s => s.name),
     codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
     num_values: BigInt(num_values),
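The two hunks above replace the single `encoding` value, previously derived at the end from whether a dictionary was used, with an `encodings` array that is pushed to as each page is written, so the returned column chunk metadata lists exactly the encodings that were emitted. A rough sketch of the pattern (hypothetical names, not the library's exact code):

```javascript
// accumulate encodings as pages are written instead of deriving one value later
function writePages(useDictionary) {
  const encodings = []
  if (useDictionary) {
    // dictionary-indexed data page
    encodings.push('RLE_DICTIONARY')
  } else {
    // plain data page
    encodings.push('PLAIN')
  }
  // the column chunk metadata then reports exactly what was written
  return { encodings }
}

console.log(writePages(true))  // { encodings: ['RLE_DICTIONARY'] }
console.log(writePages(false)) // { encodings: ['PLAIN'] }
```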
@@ -106,8 +107,7 @@ function writeDictionaryPage(writer, dictionary, type, compressed) {
   }
 
   // write dictionary page header
-  /** @type {PageHeader} */
-  const dictionaryHeader = {
+  writePageHeader(writer, {
     type: 'DICTIONARY_PAGE',
     uncompressed_page_size: dictionaryPage.offset,
     compressed_page_size: compressedDictionaryPage.offset,
@@ -115,13 +115,12 @@ function writeDictionaryPage(writer, dictionary, type, compressed) {
       num_values: dictionary.length,
       encoding: 'PLAIN',
     },
-  }
-  writePageHeader(writer, dictionaryHeader)
+  })
   writer.appendBuffer(compressedDictionaryPage.getBuffer())
 }
 
 /**
- * @import {ColumnMetaData, DecodedArray, PageHeader, ParquetType, SchemaElement, Statistics} from 'hyparquet'
+ * @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
  * @import {Writer} from '../src/types.js'
  * @param {DecodedArray} values
  * @returns {Statistics}
@@ -44,8 +44,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
   }
 
   // write page header
-  /** @type {PageHeader} */
-  const header = {
+  writePageHeader(writer, {
     type: 'DATA_PAGE_V2',
     uncompressed_page_size: levels.offset + page.offset,
     compressed_page_size: levels.offset + compressedPage.offset,
@@ -58,8 +57,7 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
       repetition_levels_byte_length,
       is_compressed: compressed,
     },
-  }
-  writePageHeader(writer, header)
+  })
 
   // write levels
   writer.appendBuffer(levels.getBuffer())
@@ -69,7 +67,6 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
 }
 
 /**
- * @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
  * @param {Writer} writer
  * @param {PageHeader} header
  */
@@ -105,6 +102,7 @@ export function writePageHeader(writer, header) {
 }
 
 /**
+ * @import {DecodedArray, PageHeader, ParquetType, SchemaElement} from 'hyparquet'
  * @param {Writer} writer
  * @param {SchemaElement[]} schemaPath
  * @param {DecodedArray} values
@@ -9,7 +9,14 @@ import { schemaFromColumnData } from './schema.js'
  * @import {ParquetWriteOptions} from '../src/types.js'
  * @param {ParquetWriteOptions} options
  */
-export function parquetWrite({ writer, columnData, compressed = true, statistics = true, rowGroupSize = 100000, kvMetadata }) {
+export function parquetWrite({
+  writer,
+  columnData,
+  compressed = true,
+  statistics = true,
+  rowGroupSize = 100000,
+  kvMetadata,
+}) {
   const schema = schemaFromColumnData(columnData)
   const pq = new ParquetWriter({
     writer,