Plain string support

This commit is contained in:
Kenny Daniel 2025-03-25 20:15:14 -07:00
parent c7d84e0e9d
commit c20356087b
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
8 changed files with 89 additions and 28 deletions

@ -0,0 +1,3 @@
# Hyparquet Writer
[![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT)

@ -3,13 +3,16 @@
* @import {Writer} from './writer.js'
* @param {Writer} writer
* @param {number[]} values
* @returns {number} bytes written
*/
export function writeRleBitPackedHybrid(writer, values) {
const offsetStart = writer.offset
// find max bitwidth
const bitWidth = Math.ceil(Math.log2(Math.max(...values) + 1))
// TODO: Try both RLE and bit-packed and choose the best
writeBitPacked(writer, values, bitWidth)
return writer.offset - offsetStart
}
/**

@ -1,4 +1,4 @@
import { Encoding, ParquetType } from 'hyparquet/src/constants.js'
import { Encoding, FieldRepetitionType, ParquetType } from 'hyparquet/src/constants.js'
import { serializeTCompactProtocol } from './thrift.js'
const CompressionCodec = [
@ -24,7 +24,7 @@ export function writeMetadata(writer, metadata) {
field_2: metadata.schema && metadata.schema.map(element => ({
field_1: element.type && ParquetType.indexOf(element.type),
field_2: element.type_length,
field_3: element.repetition_type,
field_3: element.repetition_type && FieldRepetitionType.indexOf(element.repetition_type),
field_4: element.name,
field_5: element.num_children,
field_6: element.converted_type,

@ -15,6 +15,8 @@ export function writePlain(writer, values, type) {
writePlainInt64(writer, values)
} else if (type === 'DOUBLE') {
writePlainDouble(writer, values)
} else if (type === 'BYTE_ARRAY') {
writePlainByteArray(writer, values)
} else {
throw new Error(`parquet unsupported type: ${type}`)
}
@ -76,3 +78,15 @@ function writePlainDouble(writer, values) {
writer.appendFloat64(value)
}
}
/**
* @param {Writer} writer
* @param {DecodedArray} values
*/
function writePlainByteArray(writer, values) {
for (const value of values) {
const bytes = new TextEncoder().encode(value)
writer.appendUint32(bytes.length)
writer.appendBytes(bytes)
}
}

1
src/types.d.ts vendored

@ -9,6 +9,7 @@ export interface Writer {
appendInt64(value: bigint): void
appendFloat64(value: number): void
appendBuffer(buffer: ArrayBuffer): void
appendBytes(value: Uint8Array): void
appendVarInt(value: number): void
appendVarBigInt(value: bigint): void
}

@ -78,9 +78,16 @@ Writer.prototype.appendFloat64 = function(value) {
* @param {ArrayBuffer} value
*/
Writer.prototype.appendBuffer = function(value) {
  // Delegate to appendBytes, which ensures capacity, copies the bytes and
  // advances the offset. Copying here as well would write the data twice.
  this.appendBytes(new Uint8Array(value))
}
/**
 * Copy the given bytes into the buffer at the current offset, then move the
 * offset past them.
 *
 * @param {Uint8Array} value bytes to append
 */
Writer.prototype.appendBytes = function(value) {
  const { length } = value
  // NOTE(review): ensure() presumably makes room up to the given end offset; confirm
  this.ensure(this.offset + length)
  // Create the view only after ensure() so it wraps the current this.buffer
  const target = new Uint8Array(this.buffer, this.offset, length)
  target.set(value)
  this.offset += length
}
/**

@ -11,76 +11,91 @@ export const exampleMetadata = {
version: 2,
created_by: 'hyparquet',
schema: [
{ name: 'root', num_children: 4 },
{ name: 'bool', type: 'BOOLEAN' },
{ name: 'int', type: 'INT32' },
{ name: 'bigint', type: 'INT64' },
{ name: 'double', type: 'DOUBLE' },
{ name: 'root', num_children: 5, repetition_type: 'REQUIRED' },
{ name: 'bool', type: 'BOOLEAN', repetition_type: 'REQUIRED' },
{ name: 'int', type: 'INT32', repetition_type: 'REQUIRED' },
{ name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' },
{ name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' },
{ name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED' },
],
num_rows: 4n,
row_groups: [{
columns: [
{
file_path: 'bool',
file_offset: 32n,
file_offset: 4n,
meta_data: {
type: 'BOOLEAN',
encodings: ['PLAIN'],
path_in_schema: ['bool'],
codec: 'UNCOMPRESSED',
num_values: 4n,
total_uncompressed_size: 28n,
total_compressed_size: 28n,
total_uncompressed_size: 23n,
total_compressed_size: 23n,
data_page_offset: 4n,
},
},
{
file_path: 'int',
file_offset: 75n,
file_offset: 27n,
meta_data: {
type: 'INT32',
encodings: ['PLAIN'],
path_in_schema: ['int'],
codec: 'UNCOMPRESSED',
num_values: 4n,
total_uncompressed_size: 43n,
total_compressed_size: 43n,
data_page_offset: 32n,
total_uncompressed_size: 38n,
total_compressed_size: 38n,
data_page_offset: 27n,
},
},
{
file_path: 'bigint',
file_offset: 134n,
file_offset: 65n,
meta_data: {
type: 'INT64',
encodings: ['PLAIN'],
path_in_schema: ['bigint'],
codec: 'UNCOMPRESSED',
num_values: 4n,
total_uncompressed_size: 59n,
total_compressed_size: 59n,
data_page_offset: 75n,
total_uncompressed_size: 54n,
total_compressed_size: 54n,
data_page_offset: 65n,
},
},
{
file_path: 'double',
file_offset: 193n,
file_offset: 119n,
meta_data: {
type: 'DOUBLE',
encodings: ['PLAIN'],
path_in_schema: ['double'],
codec: 'UNCOMPRESSED',
num_values: 4n,
total_uncompressed_size: 59n,
total_compressed_size: 59n,
data_page_offset: 134n,
total_uncompressed_size: 54n,
total_compressed_size: 54n,
data_page_offset: 119n,
},
},
{
file_path: 'string',
file_offset: 173n,
meta_data: {
type: 'BYTE_ARRAY',
encodings: ['PLAIN'],
path_in_schema: ['string'],
codec: 'UNCOMPRESSED',
num_values: 4n,
total_uncompressed_size: 42n,
total_compressed_size: 42n,
data_page_offset: 173n,
},
},
],
total_byte_size: 189n,
total_byte_size: 211n,
num_rows: 4n,
}],
metadata_length: 219,
metadata_length: 280,
}
describe('writeMetadata', () => {

@ -59,9 +59,27 @@ describe('writePlain', () => {
}
})
it('writes BYTE_ARRAY', () => {
  const writer = new Writer()
  const strings = ['a', 'b', 'c', 'd']
  writePlain(writer, strings, 'BYTE_ARRAY')

  // Walk the output: each entry is a little-endian uint32 length, then its bytes
  let cursor = 0
  strings.forEach(text => {
    expect(writer.view.getUint32(cursor, true)).toBe(text.length)
    cursor += 4
    for (let pos = 0; pos < text.length; pos += 1, cursor += 1) {
      expect(writer.view.getUint8(cursor)).toBe(text.charCodeAt(pos))
    }
  })
})
it('throws error on unsupported type', () => {
  const writer = new Writer()
  // BYTE_ARRAY is handled by writePlain, so it can no longer serve as the
  // unsupported example. An expect() without a matcher asserts nothing.
  expect(() => writePlain(writer, [1, 2, 3], 'INT96'))
    .toThrow(/parquet unsupported type/i)
})
})