Mirror of https://github.com/asadbek064/hyparquet-writer.git (synced 2025-12-05 23:31:54 +00:00)
Use snappy compressed pages
commit 69d373ad61
parent aca7c2bde4
@@ -3,7 +3,7 @@
 [](https://www.npmjs.com/package/hyparquet-writer)
 [](https://github.com/hyparam/hyparquet-writer/actions)
 [](https://opensource.org/licenses/MIT)


 [](https://www.npmjs.com/package/hyparquet?activeTab=dependencies)

 ## Usage
@@ -3,6 +3,7 @@ import { unconvert } from './convert.js'
 import { writeRleBitPackedHybrid } from './encoding.js'
 import { writePlain } from './plain.js'
 import { getMaxDefinitionLevel, getMaxRepetitionLevel } from './schema.js'
+import { snappyCompress } from './snappy.js'
 import { serializeTCompactProtocol } from './thrift.js'
 import { Writer } from './writer.js'
@@ -55,14 +56,16 @@ export function writeColumn(writer, schemaPath, values) {
   const page = new Writer()
   writePageData(page, values, type)

-  // TODO: compress page data
+  // compress page data
+  const compressed = new Writer()
+  snappyCompress(compressed, new Uint8Array(page.getBuffer()))

   // write page header
   /** @type {PageHeader} */
   const header = {
     type: 'DATA_PAGE_V2',
     uncompressed_page_size: levels.offset + page.offset,
-    compressed_page_size: levels.offset + page.offset,
+    compressed_page_size: levels.offset + compressed.offset,
     data_page_header_v2: {
       num_values,
       num_nulls,
@@ -70,7 +73,7 @@ export function writeColumn(writer, schemaPath, values) {
       encoding: 'PLAIN',
       definition_levels_byte_length,
       repetition_levels_byte_length,
-      is_compressed: false,
+      is_compressed: true,
     },
   }
   writePageHeader(writer, header)
@@ -79,13 +82,13 @@ export function writeColumn(writer, schemaPath, values) {
   writer.appendBuffer(levels.getBuffer())

   // write page data
-  writer.appendBuffer(page.getBuffer())
+  writer.appendBuffer(compressed.getBuffer())

   return {
     type,
     encodings: ['PLAIN'],
     path_in_schema: schemaPath.slice(1).map(s => s.name),
-    codec: 'UNCOMPRESSED',
+    codec: 'SNAPPY',
     num_values: BigInt(num_values),
     total_compressed_size: BigInt(writer.offset - offsetStart),
     total_uncompressed_size: BigInt(writer.offset - offsetStart),
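The core of the change is a compress-then-measure pattern: the page is serialized into its own Writer so its size is known, snappy-compressed into a second Writer, and only then is the page header emitted with both sizes. For DATA_PAGE_V2 the repetition/definition levels stay uncompressed (only the page data is compressed), which is why both header sizes still add levels.offset. A minimal sketch of the flow, using only the Writer and snappyCompress signatures visible in this diff (writePageData and levels stand in for the surrounding writeColumn code):

```js
import { snappyCompress } from './snappy.js'
import { Writer } from './writer.js'

// 1. serialize page data to a standalone buffer
const page = new Writer()
// writePageData(page, values, type) // as in writeColumn

// 2. snappy-compress the page bytes into a second buffer;
//    .offset doubles as the number of bytes written so far
const compressed = new Writer()
snappyCompress(compressed, new Uint8Array(page.getBuffer()))

// 3. header sizes include the (uncompressed) v2 levels plus page data:
//    uncompressed_page_size: levels.offset + page.offset
//    compressed_page_size:   levels.offset + compressed.offset
```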
@@ -29,85 +29,85 @@ export const exampleMetadata = {
       type: 'BOOLEAN',
       encodings: ['PLAIN'],
       path_in_schema: ['bool'],
-      codec: 'UNCOMPRESSED',
+      codec: 'SNAPPY',
       num_values: 4n,
-      total_uncompressed_size: 23n,
-      total_compressed_size: 23n,
+      total_uncompressed_size: 24n,
+      total_compressed_size: 24n,
       data_page_offset: 4n,
     },
   },
   {
     file_path: 'int',
-    file_offset: 27n,
+    file_offset: 28n,
     meta_data: {
       type: 'INT32',
       encodings: ['PLAIN'],
       path_in_schema: ['int'],
-      codec: 'UNCOMPRESSED',
+      codec: 'SNAPPY',
       num_values: 4n,
-      total_uncompressed_size: 38n,
-      total_compressed_size: 38n,
-      data_page_offset: 27n,
+      total_uncompressed_size: 39n,
+      total_compressed_size: 39n,
+      data_page_offset: 28n,
     },
   },
   {
     file_path: 'bigint',
-    file_offset: 65n,
+    file_offset: 67n,
     meta_data: {
       type: 'INT64',
       encodings: ['PLAIN'],
       path_in_schema: ['bigint'],
-      codec: 'UNCOMPRESSED',
+      codec: 'SNAPPY',
       num_values: 4n,
-      total_uncompressed_size: 54n,
-      total_compressed_size: 54n,
-      data_page_offset: 65n,
+      total_uncompressed_size: 43n,
+      total_compressed_size: 43n,
+      data_page_offset: 67n,
     },
   },
   {
     file_path: 'double',
-    file_offset: 119n,
+    file_offset: 110n,
     meta_data: {
       type: 'DOUBLE',
       encodings: ['PLAIN'],
       path_in_schema: ['double'],
-      codec: 'UNCOMPRESSED',
+      codec: 'SNAPPY',
       num_values: 4n,
-      total_uncompressed_size: 54n,
-      total_compressed_size: 54n,
-      data_page_offset: 119n,
+      total_uncompressed_size: 51n,
+      total_compressed_size: 51n,
+      data_page_offset: 110n,
     },
   },
   {
     file_path: 'string',
-    file_offset: 173n,
+    file_offset: 161n,
     meta_data: {
       type: 'BYTE_ARRAY',
       encodings: ['PLAIN'],
       path_in_schema: ['string'],
-      codec: 'UNCOMPRESSED',
+      codec: 'SNAPPY',
       num_values: 4n,
       total_uncompressed_size: 42n,
       total_compressed_size: 42n,
-      data_page_offset: 173n,
+      data_page_offset: 161n,
     },
   },
   {
     file_path: 'nullable',
-    file_offset: 215n,
+    file_offset: 203n,
     meta_data: {
       type: 'BOOLEAN',
       encodings: ['PLAIN'],
       path_in_schema: ['nullable'],
-      codec: 'UNCOMPRESSED',
+      codec: 'SNAPPY',
       num_values: 4n,
-      total_uncompressed_size: 25n,
-      total_compressed_size: 25n,
-      data_page_offset: 215n,
+      total_uncompressed_size: 26n,
+      total_compressed_size: 26n,
+      data_page_offset: 203n,
     },
   },
   ],
-  total_byte_size: 236n,
+  total_byte_size: 225n,
   num_rows: 4n,
 }],
 metadata_length: 338,
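The expected sizes shift accordingly: tiny pages that don't compress grow by a byte or two of snappy framing (a varint uncompressed-length prefix plus literal tags), as with the boolean chunk at 23n -> 24n, while the repetitive INT64 page genuinely shrinks (54n -> 43n), and every later file_offset/data_page_offset moves with the new page sizes. A quick sketch to observe both effects, assuming the repo's snappyCompress(writer, bytes) and Writer as shown in the diff above:

```js
import { snappyCompress } from './snappy.js'
import { Writer } from './writer.js'

// returns the snappy-compressed size of a payload
function snappySize(bytes) {
  const out = new Writer()
  snappyCompress(out, bytes)
  return out.offset
}

// random bytes don't compress: framing makes the output slightly larger
console.log(snappySize(crypto.getRandomValues(new Uint8Array(23))))

// a repetitive payload (like the bigint test column) compresses well
console.log(snappySize(new Uint8Array(32).fill(7)))
```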
@@ -48,7 +48,7 @@ describe('parquetWrite', () => {
     bool[500] = true
     bool[9999] = false
     const file = parquetWrite([{ name: 'bool', data: bool }])
-    expect(file.byteLength).toBe(147)
+    expect(file.byteLength).toBe(148)
     const metadata = parquetMetadata(file)
     expect(metadata.metadata_length).toBe(86)
     const result = await parquetReadObjects({ file })
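Beyond exact byte counts, these tests round-trip the file through hyparquet, so the snappy pages must actually decompress back to the original values. A minimal round-trip sketch, assuming hyparquet's parquetMetadata/parquetReadObjects as used in this test file and parquetWrite from this repo's entry point (import path assumed):

```js
import { parquetMetadata, parquetReadObjects } from 'hyparquet'
import { parquetWrite } from '../src/index.js' // assumed entry point

const file = parquetWrite([{ name: 'int', data: [1, 2, 3, 4] }])

// every column chunk should now advertise the SNAPPY codec...
const metadata = parquetMetadata(file)
console.log(metadata.row_groups[0].columns[0].meta_data?.codec) // 'SNAPPY'

// ...and the values should survive a decompressing read
const rows = await parquetReadObjects({ file })
console.log(rows) // [{ int: 1 }, { int: 2 }, { int: 3 }, { int: 4 }]
```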
@@ -64,7 +64,7 @@ describe('parquetWrite', () => {
   it('efficiently serializes long string', () => {
     const str = 'a'.repeat(10000)
     const file = parquetWrite([{ name: 'string', data: [str] }])
-    expect(file.byteLength).toBe(10136)
+    expect(file.byteLength).toBe(606)
   })

   it('serializes list types', async () => {
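The long-string expectation is where compression pays off: 10000 repeated characters shrink the file from 10136 to 606 bytes, because snappy encodes the run as a short literal followed by back-reference copies. A hedged variant of the same kind of assertion, assuming the vitest setup of this file and the same assumed entry point (the exact byte count depends on framing, so it only asserts an upper bound):

```js
import { describe, expect, it } from 'vitest'
import { parquetWrite } from '../src/index.js' // assumed entry point

describe('parquetWrite', () => {
  it('compresses a repetitive string column', () => {
    const str = 'ab'.repeat(5000) // 10000 bytes, highly compressible
    const file = parquetWrite([{ name: 'string', data: [str] }])
    // with snappy pages the file should be far below the raw payload size
    expect(file.byteLength).toBeLessThan(1000)
  })
})
```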