Mirror of https://github.com/asadbek064/hyparquet-writer.git (synced 2025-12-05 23:31:54 +00:00)
Optional compression flag

This commit is contained in:
parent e3986a33c5
commit 2e0431d815
README.md (11 changed lines)
@@ -1,6 +1,7 @@
 # Hyparquet Writer
 
 [](https://www.npmjs.com/package/hyparquet-writer)
+[](https://www.npmjs.com/package/hyparquet-writer)
 [](https://github.com/hyparam/hyparquet-writer/actions)
 [](https://opensource.org/licenses/MIT)
 
@@ -13,10 +14,12 @@ Call `parquetWrite` with a list of columns, each column is an object with a `nam
 
 ```javascript
 import { parquetWrite } from 'hyparquet-writer'
 
-const arrayBuffer = parquetWrite([
-  { name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
-  { name: 'age', data: [25, 30, 35] },
-])
+const arrayBuffer = parquetWrite({
+  columnData: [
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
+    { name: 'age', data: [25, 30, 35] },
+  ],
+})
 ```
 
 ## References
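The README example above shows only the default call; a minimal sketch of the new flag this commit introduces (`compressed` defaults to `true`, so passing `false` opts out of snappy page compression):

```javascript
import { parquetWrite } from 'hyparquet-writer'

// same columns as the README example, but with page compression disabled
const arrayBuffer = parquetWrite({
  columnData: [
    { name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
    { name: 'age', data: [25, 30, 35] },
  ],
  compressed: false,
})
```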
@@ -12,9 +12,10 @@ import { Writer } from './writer.js'
  * @param {Writer} writer
  * @param {SchemaElement[]} schemaPath
  * @param {DecodedArray} values
+ * @param {boolean} compressed
  * @returns {ColumnMetaData}
  */
-export function writeColumn(writer, schemaPath, values) {
+export function writeColumn(writer, schemaPath, values, compressed) {
   const schemaElement = schemaPath[schemaPath.length - 1]
   const { type } = schemaElement
   if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`)
@@ -57,15 +58,18 @@ export function writeColumn(writer, schemaPath, values) {
   writePageData(page, values, type)
 
   // compress page data
-  const compressed = new Writer()
-  snappyCompress(compressed, new Uint8Array(page.getBuffer()))
+  let compressedPage = page
+  if (compressed) {
+    compressedPage = new Writer()
+    snappyCompress(compressedPage, new Uint8Array(page.getBuffer()))
+  }
 
   // write page header
   /** @type {PageHeader} */
   const header = {
     type: 'DATA_PAGE_V2',
     uncompressed_page_size: levels.offset + page.offset,
-    compressed_page_size: levels.offset + compressed.offset,
+    compressed_page_size: levels.offset + compressedPage.offset,
     data_page_header_v2: {
       num_values,
       num_nulls,
@@ -82,13 +86,13 @@ export function writeColumn(writer, schemaPath, values) {
   writer.appendBuffer(levels.getBuffer())
 
   // write page data
-  writer.appendBuffer(compressed.getBuffer())
+  writer.appendBuffer(compressedPage.getBuffer())
 
   return {
     type,
     encodings: ['PLAIN'],
     path_in_schema: schemaPath.slice(1).map(s => s.name),
-    codec: 'SNAPPY',
+    codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
     num_values: BigInt(num_values),
     total_compressed_size: BigInt(writer.offset - offsetStart),
     total_uncompressed_size: BigInt(writer.offset - offsetStart),
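One way to confirm the flag reaches the column metadata is to read the file back with hyparquet's `parquetMetadata`, as the test suite below does; a sketch, assuming the parquet `FileMetaData` shape that hyparquet exposes (`row_groups[i].columns[j].meta_data.codec`):

```javascript
import { parquetMetadata } from 'hyparquet'
import { parquetWrite } from 'hyparquet-writer'

const columnData = [{ name: 'age', data: [25, 30, 35] }]

// codec should reflect the compressed flag passed down to writeColumn
const plain = parquetMetadata(parquetWrite({ columnData, compressed: false }))
console.log(plain.row_groups[0].columns[0].meta_data?.codec) // 'UNCOMPRESSED'

const snappy = parquetMetadata(parquetWrite({ columnData })) // default: compressed
console.log(snappy.row_groups[0].columns[0].meta_data?.codec) // 'SNAPPY'
```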
@@ -8,10 +8,12 @@ import { getSchemaElementForValues } from './schema.js'
  *
  * @import {ColumnChunk, DecodedArray, FileMetaData, SchemaElement, SchemaTree} from 'hyparquet'
  * @import {ColumnData} from '../src/types.js'
- * @param {ColumnData[]} columnData
+ * @param {object} options
+ * @param {ColumnData[]} options.columnData
+ * @param {boolean} [options.compressed]
  * @returns {ArrayBuffer}
  */
-export function parquetWrite(columnData) {
+export function parquetWrite({ columnData, compressed = true }) {
   const writer = new Writer()
 
   // Check if all columns have the same length
@@ -47,7 +49,7 @@ export function parquetWrite(columnData) {
       schema[0],
       schemaElement,
     ]
-    const meta_data = writeColumn(writer, schemaPath, data)
+    const meta_data = writeColumn(writer, schemaPath, data, compressed)
 
     // save metadata
     schema.push(schemaElement)
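Note this signature change is breaking for existing callers: `parquetWrite` no longer accepts a bare column array. A minimal migration sketch:

```javascript
import { parquetWrite } from 'hyparquet-writer'

const columnData = [{ name: 'id', data: [1, 2, 3] }]

// before this commit:
// const file = parquetWrite(columnData)

// after this commit (compression still on by default):
const file = parquetWrite({ columnData })
```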
@@ -11,7 +11,7 @@ import { exampleMetadata } from './metadata.test.js'
  * @returns {Promise<Record<string, any>>}
  */
 async function roundTripDeserialize(columnData) {
-  const file = parquetWrite(columnData)
+  const file = parquetWrite({ columnData })
   return await parquetReadObjects({ file, utf8: false })
 }
 
@@ -26,7 +26,7 @@ const basicData = [
 
 describe('parquetWrite', () => {
   it('writes expected metadata', () => {
-    const file = parquetWrite(basicData)
+    const file = parquetWrite({ columnData: basicData })
     const metadata = parquetMetadata(file)
     expect(metadata).toEqual(exampleMetadata)
   })
@@ -47,7 +47,7 @@ describe('parquetWrite', () => {
     bool[100] = false
     bool[500] = true
     bool[9999] = false
-    const file = parquetWrite([{ name: 'bool', data: bool }])
+    const file = parquetWrite({ columnData: [{ name: 'bool', data: bool }] })
     expect(file.byteLength).toBe(148)
     const metadata = parquetMetadata(file)
     expect(metadata.metadata_length).toBe(86)
@@ -63,10 +63,23 @@ describe('parquetWrite', () => {
 
   it('efficiently serializes long string', () => {
     const str = 'a'.repeat(10000)
-    const file = parquetWrite([{ name: 'string', data: [str] }])
+    const file = parquetWrite({ columnData: [{ name: 'string', data: [str] }] })
     expect(file.byteLength).toBe(606)
   })
 
+  it('less efficiently serializes string without compression', () => {
+    const str = 'a'.repeat(10000)
+    const columnData = [{ name: 'string', data: [str] }]
+    const file = parquetWrite({ columnData, compressed: false })
+    expect(file.byteLength).toBe(10135)
+  })
+
+  it('efficiently represents column with few distinct values', () => {
+    const data = Array(10000).fill('aaaa')
+    const file = parquetWrite({ columnData: [{ name: 'string', data }] })
+    expect(file.byteLength).toBe(3908)
+  })
+
   it('serializes list types', async () => {
     const result = await roundTripDeserialize([{
       name: 'list',
@@ -120,7 +133,7 @@ describe('parquetWrite', () => {
   })
 
   it('throws for mixed types', () => {
-    expect(() => parquetWrite([{ name: 'mixed', data: [1, 2, 3, 'boom'] }]))
+    expect(() => parquetWrite({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] }))
       .toThrow('mixed types not supported')
   })
 })
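As a final sanity check, data written without compression should still round-trip through hyparquet's reader, mirroring the `roundTripDeserialize` helper above; a sketch:

```javascript
import { parquetReadObjects } from 'hyparquet'
import { parquetWrite } from 'hyparquet-writer'

const file = parquetWrite({
  columnData: [{ name: 'name', data: ['Alice', 'Bob'] }],
  compressed: false,
})

// parquetReadObjects returns one object per row
const rows = await parquetReadObjects({ file })
// rows: [{ name: 'Alice' }, { name: 'Bob' }]
```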