From 2e0431d81587ec6cb71e639b05ee7af3f017f9aa Mon Sep 17 00:00:00 2001
From: Kenny Daniel
Date: Thu, 27 Mar 2025 00:27:22 -0700
Subject: [PATCH] Optional compression flag

---
 README.md          | 11 +++++++----
 src/column.js      | 16 ++++++++++------
 src/write.js       |  8 +++++---
 test/write.test.js | 23 ++++++++++++++++++-----
 4 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 484fdd9..9594b93 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # Hyparquet Writer
 
 [![npm](https://img.shields.io/npm/v/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer)
+[![minzipped](https://img.shields.io/bundlephobia/minzip/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer)
 [![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions)
 [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)
 ![coverage](https://img.shields.io/badge/Coverage-96-darkred)
@@ -13,10 +14,12 @@ Call `parquetWrite` with a list of columns, each column is an object with a `nam
 ```javascript
 import { parquetWrite } from 'hyparquet-writer'
 
-const arrayBuffer = parquetWrite([
-  { name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
-  { name: 'age', data: [25, 30, 35] },
-])
+const arrayBuffer = parquetWrite({
+  columnData: [
+    { name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
+    { name: 'age', data: [25, 30, 35] },
+  ],
+})
 ```
 
 ## References
diff --git a/src/column.js b/src/column.js
index 3f9163f..f41c000 100644
--- a/src/column.js
+++ b/src/column.js
@@ -12,9 +12,10 @@ import { Writer } from './writer.js'
  * @param {Writer} writer
  * @param {SchemaElement[]} schemaPath
  * @param {DecodedArray} values
+ * @param {boolean} compressed
  * @returns {ColumnMetaData}
  */
-export function writeColumn(writer, schemaPath, values) {
+export function writeColumn(writer, schemaPath, values, compressed) {
   const schemaElement = schemaPath[schemaPath.length - 1]
   const { type } = schemaElement
   if (!type) throw new Error(`column ${schemaElement.name} cannot determine type`)
@@ -57,15 +58,18 @@ export function writeColumn(writer, schemaPath, values) {
   writePageData(page, values, type)
 
   // compress page data
-  const compressed = new Writer()
-  snappyCompress(compressed, new Uint8Array(page.getBuffer()))
+  let compressedPage = page
+  if (compressed) {
+    compressedPage = new Writer()
+    snappyCompress(compressedPage, new Uint8Array(page.getBuffer()))
+  }
 
   // write page header
   /** @type {PageHeader} */
   const header = {
     type: 'DATA_PAGE_V2',
     uncompressed_page_size: levels.offset + page.offset,
-    compressed_page_size: levels.offset + compressed.offset,
+    compressed_page_size: levels.offset + compressedPage.offset,
     data_page_header_v2: {
       num_values,
       num_nulls,
@@ -82,13 +86,13 @@
   writer.appendBuffer(levels.getBuffer())
 
   // write page data
-  writer.appendBuffer(compressed.getBuffer())
+  writer.appendBuffer(compressedPage.getBuffer())
 
   return {
     type,
     encodings: ['PLAIN'],
     path_in_schema: schemaPath.slice(1).map(s => s.name),
-    codec: 'SNAPPY',
+    codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
     num_values: BigInt(num_values),
     total_compressed_size: BigInt(writer.offset - offsetStart),
     total_uncompressed_size: BigInt(writer.offset - offsetStart),
diff --git a/src/write.js b/src/write.js
index f4a146e..ba8ad2e 100644
--- a/src/write.js
+++ b/src/write.js
@@ -8,10 +8,12 @@ import { getSchemaElementForValues } from './schema.js'
  *
  * @import {ColumnChunk, DecodedArray, FileMetaData, SchemaElement, SchemaTree} from 'hyparquet'
  * @import {ColumnData} from '../src/types.js'
- * @param {ColumnData[]} columnData
+ * @param {object} options
+ * @param {ColumnData[]} options.columnData
+ * @param {boolean} [options.compressed]
  * @returns {ArrayBuffer}
  */
-export function parquetWrite(columnData) {
+export function parquetWrite({ columnData, compressed = true }) {
   const writer = new Writer()
 
   // Check if all columns have the same length
@@ -47,7 +49,7 @@
       schema[0],
       schemaElement,
     ]
-    const meta_data = writeColumn(writer, schemaPath, data)
+    const meta_data = writeColumn(writer, schemaPath, data, compressed)
 
     // save metadata
     schema.push(schemaElement)
diff --git a/test/write.test.js b/test/write.test.js
index ebc9ef8..285c8be 100644
--- a/test/write.test.js
+++ b/test/write.test.js
@@ -11,7 +11,7 @@ import { exampleMetadata } from './metadata.test.js'
  * @returns {Promise<Array<object>>}
  */
 async function roundTripDeserialize(columnData) {
-  const file = parquetWrite(columnData)
+  const file = parquetWrite({ columnData })
   return await parquetReadObjects({ file, utf8: false })
 }
 
@@ -26,7 +26,7 @@ const basicData = [
 
 describe('parquetWrite', () => {
   it('writes expected metadata', () => {
-    const file = parquetWrite(basicData)
+    const file = parquetWrite({ columnData: basicData })
     const metadata = parquetMetadata(file)
     expect(metadata).toEqual(exampleMetadata)
   })
@@ -47,7 +47,7 @@
     bool[100] = false
     bool[500] = true
     bool[9999] = false
-    const file = parquetWrite([{ name: 'bool', data: bool }])
+    const file = parquetWrite({ columnData: [{ name: 'bool', data: bool }] })
    expect(file.byteLength).toBe(148)
     const metadata = parquetMetadata(file)
     expect(metadata.metadata_length).toBe(86)
@@ -63,10 +63,23 @@
 
   it('efficiently serializes long string', () => {
     const str = 'a'.repeat(10000)
-    const file = parquetWrite([{ name: 'string', data: [str] }])
+    const file = parquetWrite({ columnData: [{ name: 'string', data: [str] }] })
     expect(file.byteLength).toBe(606)
   })
 
+  it('less efficiently serializes string without compression', () => {
+    const str = 'a'.repeat(10000)
+    const columnData = [{ name: 'string', data: [str] }]
+    const file = parquetWrite({ columnData, compressed: false })
+    expect(file.byteLength).toBe(10135)
+  })
+
+  it('efficiently represents column with few distinct values', () => {
+    const data = Array(10000).fill('aaaa')
+    const file = parquetWrite({ columnData: [{ name: 'string', data }] })
+    expect(file.byteLength).toBe(3908)
+  })
+
   it('serializes list types', async () => {
     const result = await roundTripDeserialize([{ name: 'list',
@@ -120,7 +133,7 @@
   })
 
   it('throws for mixed types', () => {
-    expect(() => parquetWrite([{ name: 'mixed', data: [1, 2, 3, 'boom'] }]))
+    expect(() => parquetWrite({ columnData: [{ name: 'mixed', data: [1, 2, 3, 'boom'] }] }))
       .toThrow('mixed types not supported')
   })
 })
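
A minimal usage sketch of the new flag, for reviewers (not part of the patch). It assumes an ESM context with top-level `await`, and uses hyparquet's `parquetMetadata` and `parquetReadObjects` the same way the tests above do:

```javascript
import { parquetWrite } from 'hyparquet-writer'
import { parquetMetadata, parquetReadObjects } from 'hyparquet'

const columnData = [
  { name: 'name', data: ['Alice', 'Bob', 'Charlie'] },
  { name: 'age', data: [25, 30, 35] },
]

// snappy compression remains the default
const compressedFile = parquetWrite({ columnData })

// compressed: false skips the snappy pass and writes page bytes as-is
const uncompressedFile = parquetWrite({ columnData, compressed: false })

// each column chunk records its codec in the footer metadata:
// 'SNAPPY' by default, 'UNCOMPRESSED' when the flag is false
const { row_groups } = parquetMetadata(uncompressedFile)
console.log(row_groups[0].columns.map(c => c.meta_data?.codec))

// either file round-trips to the same rows
const rows = await parquetReadObjects({ file: uncompressedFile })
console.log(rows)
```

The trade-off is visible in the new tests: the 10 KB single-string column serializes to 606 bytes with snappy but 10135 bytes uncompressed.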