From 69d373ad6132e56500e05aaf9de4efcafea1a84f Mon Sep 17 00:00:00 2001 From: Kenny Daniel Date: Thu, 27 Mar 2025 00:01:24 -0700 Subject: [PATCH] Use snappy compressed pages --- README.md | 2 +- src/column.js | 13 +++++++---- test/metadata.test.js | 54 +++++++++++++++++++++---------------------- test/write.test.js | 4 ++-- 4 files changed, 38 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 71c80be..484fdd9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![npm](https://img.shields.io/npm/v/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer) [![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions) [![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) -![coverage](https://img.shields.io/badge/Coverage-95-darkred) +![coverage](https://img.shields.io/badge/Coverage-96-darkred) [![dependencies](https://img.shields.io/badge/Dependencies-0-blueviolet)](https://www.npmjs.com/package/hyparquet?activeTab=dependencies) ## Usage diff --git a/src/column.js b/src/column.js index 2ee59fd..3f9163f 100644 --- a/src/column.js +++ b/src/column.js @@ -3,6 +3,7 @@ import { unconvert } from './convert.js' import { writeRleBitPackedHybrid } from './encoding.js' import { writePlain } from './plain.js' import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js' +import { snappyCompress } from './snappy.js' import { serializeTCompactProtocol } from './thrift.js' import { Writer } from './writer.js' @@ -55,14 +56,16 @@ export function writeColumn(writer, schemaPath, values) { const page = new Writer() writePageData(page, values, type) - // TODO: compress page data + // compress page data + const compressed = new Writer() + snappyCompress(compressed, new Uint8Array(page.getBuffer())) // write page header /** @type {PageHeader} */ const header = { type: 'DATA_PAGE_V2', uncompressed_page_size: levels.offset + page.offset, - compressed_page_size: levels.offset + page.offset, + compressed_page_size: levels.offset + compressed.offset, data_page_header_v2: { num_values, num_nulls, @@ -70,7 +73,7 @@ export function writeColumn(writer, schemaPath, values) { encoding: 'PLAIN', definition_levels_byte_length, repetition_levels_byte_length, - is_compressed: false, + is_compressed: true, }, } writePageHeader(writer, header) @@ -79,13 +82,13 @@ export function writeColumn(writer, schemaPath, values) { writer.appendBuffer(levels.getBuffer()) // write page data - writer.appendBuffer(page.getBuffer()) + writer.appendBuffer(compressed.getBuffer()) return { type, encodings: ['PLAIN'], path_in_schema: schemaPath.slice(1).map(s => s.name), - codec: 'UNCOMPRESSED', + codec: 'SNAPPY', num_values: BigInt(num_values), total_compressed_size: BigInt(writer.offset - offsetStart), total_uncompressed_size: BigInt(writer.offset - offsetStart), diff --git a/test/metadata.test.js b/test/metadata.test.js index fa224e3..b72e204 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -29,85 +29,85 @@ export const exampleMetadata = { type: 'BOOLEAN', encodings: ['PLAIN'], path_in_schema: ['bool'], - codec: 'UNCOMPRESSED', + codec: 'SNAPPY', num_values: 4n, - total_uncompressed_size: 23n, - total_compressed_size: 23n, + total_uncompressed_size: 24n, + total_compressed_size: 24n, data_page_offset: 4n, }, }, { file_path: 'int', - file_offset: 27n, + file_offset: 28n, meta_data: { type: 'INT32', encodings: ['PLAIN'], path_in_schema: ['int'], - codec: 'UNCOMPRESSED', + codec: 'SNAPPY', num_values: 4n, - total_uncompressed_size: 38n, - total_compressed_size: 38n, - data_page_offset: 27n, + total_uncompressed_size: 39n, + total_compressed_size: 39n, + data_page_offset: 28n, }, }, { file_path: 'bigint', - file_offset: 65n, + file_offset: 67n, meta_data: { type: 'INT64', encodings: ['PLAIN'], path_in_schema: ['bigint'], - codec: 'UNCOMPRESSED', + codec: 'SNAPPY', num_values: 4n, - total_uncompressed_size: 54n, - total_compressed_size: 54n, - data_page_offset: 65n, + total_uncompressed_size: 43n, + total_compressed_size: 43n, + data_page_offset: 67n, }, }, { file_path: 'double', - file_offset: 119n, + file_offset: 110n, meta_data: { type: 'DOUBLE', encodings: ['PLAIN'], path_in_schema: ['double'], - codec: 'UNCOMPRESSED', + codec: 'SNAPPY', num_values: 4n, - total_uncompressed_size: 54n, - total_compressed_size: 54n, - data_page_offset: 119n, + total_uncompressed_size: 51n, + total_compressed_size: 51n, + data_page_offset: 110n, }, }, { file_path: 'string', - file_offset: 173n, + file_offset: 161n, meta_data: { type: 'BYTE_ARRAY', encodings: ['PLAIN'], path_in_schema: ['string'], - codec: 'UNCOMPRESSED', + codec: 'SNAPPY', num_values: 4n, total_uncompressed_size: 42n, total_compressed_size: 42n, - data_page_offset: 173n, + data_page_offset: 161n, }, }, { file_path: 'nullable', - file_offset: 215n, + file_offset: 203n, meta_data: { type: 'BOOLEAN', encodings: ['PLAIN'], path_in_schema: ['nullable'], - codec: 'UNCOMPRESSED', + codec: 'SNAPPY', num_values: 4n, - total_uncompressed_size: 25n, - total_compressed_size: 25n, - data_page_offset: 215n, + total_uncompressed_size: 26n, + total_compressed_size: 26n, + data_page_offset: 203n, }, }, ], - total_byte_size: 236n, + total_byte_size: 225n, num_rows: 4n, }], metadata_length: 338, diff --git a/test/write.test.js b/test/write.test.js index db180c7..ebc9ef8 100644 --- a/test/write.test.js +++ b/test/write.test.js @@ -48,7 +48,7 @@ describe('parquetWrite', () => { bool[500] = true bool[9999] = false const file = parquetWrite([{ name: 'bool', data: bool }]) - expect(file.byteLength).toBe(147) + expect(file.byteLength).toBe(148) const metadata = parquetMetadata(file) expect(metadata.metadata_length).toBe(86) const result = await parquetReadObjects({ file }) @@ -64,7 +64,7 @@ describe('parquetWrite', () => { it('efficiently serializes long string', () => { const str = 'a'.repeat(10000) const file = parquetWrite([{ name: 'string', data: [str] }]) - expect(file.byteLength).toBe(10136) + expect(file.byteLength).toBe(606) }) it('serializes list types', async () => {