Use snappy compressed pages

Kenny Daniel 2025-03-27 00:01:24 -07:00
parent aca7c2bde4
commit 69d373ad61
4 changed files with 38 additions and 35 deletions
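
With this change, data pages are snappy-compressed by default and column chunks advertise the SNAPPY codec. A minimal round-trip sketch, assuming `parquetMetadata` and `parquetReadObjects` are imported from hyparquet as in the tests below:

```js
import { parquetWrite } from 'hyparquet-writer'
import { parquetMetadata, parquetReadObjects } from 'hyparquet' // assumed import

const file = parquetWrite([{ name: 'int', data: [1, 2, 3, 4] }])

// Column chunk metadata should now report the snappy codec:
const metadata = parquetMetadata(file)
console.log(metadata.row_groups[0].columns[0].meta_data.codec) // 'SNAPPY'

// Reading back decompresses transparently:
const rows = await parquetReadObjects({ file })
console.log(rows.length) // 4
```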

@@ -3,7 +3,7 @@
[![npm](https://img.shields.io/npm/v/hyparquet-writer)](https://www.npmjs.com/package/hyparquet-writer)
[![workflow status](https://github.com/hyparam/hyparquet-writer/actions/workflows/ci.yml/badge.svg)](https://github.com/hyparam/hyparquet-writer/actions)
[![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)
-![coverage](https://img.shields.io/badge/Coverage-95-darkred)
+![coverage](https://img.shields.io/badge/Coverage-96-darkred)
[![dependencies](https://img.shields.io/badge/Dependencies-0-blueviolet)](https://www.npmjs.com/package/hyparquet?activeTab=dependencies)
## Usage

@@ -3,6 +3,7 @@ import { unconvert } from './convert.js'
import { writeRleBitPackedHybrid } from './encoding.js'
import { writePlain } from './plain.js'
import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
+import { snappyCompress } from './snappy.js'
import { serializeTCompactProtocol } from './thrift.js'
import { Writer } from './writer.js'
@@ -55,14 +56,16 @@ export function writeColumn(writer, schemaPath, values) {
const page = new Writer()
writePageData(page, values, type)
-// TODO: compress page data
+// compress page data
+const compressed = new Writer()
+snappyCompress(compressed, new Uint8Array(page.getBuffer()))
// write page header
/** @type {PageHeader} */
const header = {
type: 'DATA_PAGE_V2',
uncompressed_page_size: levels.offset + page.offset,
-compressed_page_size: levels.offset + page.offset,
+compressed_page_size: levels.offset + compressed.offset,
data_page_header_v2: {
num_values,
num_nulls,
@@ -70,7 +73,7 @@ export function writeColumn(writer, schemaPath, values) {
encoding: 'PLAIN',
definition_levels_byte_length,
repetition_levels_byte_length,
-is_compressed: false,
+is_compressed: true,
},
}
writePageHeader(writer, header)
@@ -79,13 +82,13 @@ export function writeColumn(writer, schemaPath, values) {
writer.appendBuffer(levels.getBuffer())
// write page data
-writer.appendBuffer(page.getBuffer())
+writer.appendBuffer(compressed.getBuffer())
return {
type,
encodings: ['PLAIN'],
path_in_schema: schemaPath.slice(1).map(s => s.name),
-codec: 'UNCOMPRESSED',
+codec: 'SNAPPY',
num_values: BigInt(num_values),
total_compressed_size: BigInt(writer.offset - offsetStart),
total_uncompressed_size: BigInt(writer.offset - offsetStart),
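
The shape of the change in writeColumn: raw page bytes are staged in one Writer, snappy-compressed into a second Writer, and the page header records both sizes. In DATA_PAGE_V2 the repetition/definition levels stay outside the compressed region, which is why both sizes include levels.offset. A condensed, illustrative sketch using the same Writer API the diff relies on (offset, getBuffer, appendBuffer):

```js
import { snappyCompress } from './snappy.js'
import { Writer } from './writer.js'

// Stage page bytes, then compress into a separate buffer so both byte
// counts are known when the page header is built.
const page = new Writer()
page.appendBuffer(new Uint8Array(1000).fill(97).buffer) // stand-in page data

const compressed = new Writer()
snappyCompress(compressed, new Uint8Array(page.getBuffer()))

// uncompressed_page_size = levels.offset + page.offset
// compressed_page_size   = levels.offset + compressed.offset
console.log(page.offset, compressed.offset) // 1000 vs a few dozen bytes
```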

@@ -29,85 +29,85 @@ export const exampleMetadata = {
type: 'BOOLEAN',
encodings: ['PLAIN'],
path_in_schema: ['bool'],
-codec: 'UNCOMPRESSED',
+codec: 'SNAPPY',
num_values: 4n,
-total_uncompressed_size: 23n,
-total_compressed_size: 23n,
+total_uncompressed_size: 24n,
+total_compressed_size: 24n,
data_page_offset: 4n,
},
},
{
file_path: 'int',
-file_offset: 27n,
+file_offset: 28n,
meta_data: {
type: 'INT32',
encodings: ['PLAIN'],
path_in_schema: ['int'],
-codec: 'UNCOMPRESSED',
+codec: 'SNAPPY',
num_values: 4n,
-total_uncompressed_size: 38n,
-total_compressed_size: 38n,
-data_page_offset: 27n,
+total_uncompressed_size: 39n,
+total_compressed_size: 39n,
+data_page_offset: 28n,
},
},
{
file_path: 'bigint',
-file_offset: 65n,
+file_offset: 67n,
meta_data: {
type: 'INT64',
encodings: ['PLAIN'],
path_in_schema: ['bigint'],
-codec: 'UNCOMPRESSED',
+codec: 'SNAPPY',
num_values: 4n,
-total_uncompressed_size: 54n,
-total_compressed_size: 54n,
-data_page_offset: 65n,
+total_uncompressed_size: 43n,
+total_compressed_size: 43n,
+data_page_offset: 67n,
},
},
{
file_path: 'double',
-file_offset: 119n,
+file_offset: 110n,
meta_data: {
type: 'DOUBLE',
encodings: ['PLAIN'],
path_in_schema: ['double'],
-codec: 'UNCOMPRESSED',
+codec: 'SNAPPY',
num_values: 4n,
-total_uncompressed_size: 54n,
-total_compressed_size: 54n,
-data_page_offset: 119n,
+total_uncompressed_size: 51n,
+total_compressed_size: 51n,
+data_page_offset: 110n,
},
},
{
file_path: 'string',
-file_offset: 173n,
+file_offset: 161n,
meta_data: {
type: 'BYTE_ARRAY',
encodings: ['PLAIN'],
path_in_schema: ['string'],
-codec: 'UNCOMPRESSED',
+codec: 'SNAPPY',
num_values: 4n,
total_uncompressed_size: 42n,
total_compressed_size: 42n,
-data_page_offset: 173n,
+data_page_offset: 161n,
},
},
{
file_path: 'nullable',
-file_offset: 215n,
+file_offset: 203n,
meta_data: {
type: 'BOOLEAN',
encodings: ['PLAIN'],
path_in_schema: ['nullable'],
-codec: 'UNCOMPRESSED',
+codec: 'SNAPPY',
num_values: 4n,
-total_uncompressed_size: 25n,
-total_compressed_size: 25n,
-data_page_offset: 215n,
+total_uncompressed_size: 26n,
+total_compressed_size: 26n,
+data_page_offset: 203n,
},
},
],
-total_byte_size: 236n,
+total_byte_size: 225n,
num_rows: 4n,
}],
metadata_length: 338,
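
The fixture shifts follow mechanically from the new page sizes. Tiny pages can grow slightly under snappy (its length preamble and literal tag add a byte or two, hence bool at 23n → 24n), while the zero-heavy bigint and double pages shrink. Since each column chunk starts where the previous one ends, the offsets are just a running sum of the compressed sizes:

```js
// Recompute the fixture offsets from the compressed page sizes above.
const sizes = [24, 39, 43, 51, 42, 26]
let offset = 4 // first data page starts after the 4-byte 'PAR1' magic
for (const size of sizes) {
  console.log(offset) // 4, 28, 67, 110, 161, 203 — the data_page_offset values
  offset += size
}
console.log(sizes.reduce((a, b) => a + b, 0)) // 225 — the new total_byte_size
```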

@@ -48,7 +48,7 @@ describe('parquetWrite', () => {
bool[500] = true
bool[9999] = false
const file = parquetWrite([{ name: 'bool', data: bool }])
-expect(file.byteLength).toBe(147)
+expect(file.byteLength).toBe(148)
const metadata = parquetMetadata(file)
expect(metadata.metadata_length).toBe(86)
const result = await parquetReadObjects({ file })
@@ -64,7 +64,7 @@ describe('parquetWrite', () => {
it('efficiently serializes long string', () => {
const str = 'a'.repeat(10000)
const file = parquetWrite([{ name: 'string', data: [str] }])
-expect(file.byteLength).toBe(10136)
+expect(file.byteLength).toBe(606)
})
it('serializes list types', async () => {
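
The long-string test shows the payoff: 10,000 repeated bytes shrink the file from 10,136 to 606 bytes. Snappy encodes a run as a short literal followed by overlapping back-reference copies of at most 64 bytes, each a 3-byte op, so the page data lands in the high 400s before headers and footer. A rough, hypothetical estimator:

```js
// Back-of-envelope snappy output size for a run of n identical bytes
// (illustrative only, not the real encoder): a uvarint length preamble,
// a 2-byte single-char literal, then ceil((n - 1) / 64) copies at 3 bytes each.
function estimateSnappyRunSize(n) {
  const preamble = Math.ceil(Math.ceil(Math.log2(n + 1)) / 7)
  const literal = 2 // tag byte + the first byte of the run
  const copies = Math.ceil((n - 1) / 64) * 3
  return preamble + literal + copies
}

console.log(estimateSnappyRunSize(10000)) // 475 — headers and footer bring the file to ~606
```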