diff --git a/README.md b/README.md index e69de29..cda39ed 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,3 @@ +# Hyparquet Writer + +[![mit license](https://img.shields.io/badge/License-MIT-orange.svg)](https://opensource.org/licenses/MIT) diff --git a/src/encoding.js b/src/encoding.js index dc154de..3909397 100644 --- a/src/encoding.js +++ b/src/encoding.js @@ -3,13 +3,16 @@ * @import {Writer} from './writer.js' * @param {Writer} writer * @param {number[]} values + * @returns {number} bytes written */ export function writeRleBitPackedHybrid(writer, values) { + const offsetStart = writer.offset // find max bitwidth const bitWidth = Math.ceil(Math.log2(Math.max(...values) + 1)) // TODO: Try both RLE and bit-packed and choose the best writeBitPacked(writer, values, bitWidth) + return writer.offset - offsetStart } /** diff --git a/src/metadata.js b/src/metadata.js index a6aa0e0..b898ada 100644 --- a/src/metadata.js +++ b/src/metadata.js @@ -1,4 +1,4 @@ -import { Encoding, ParquetType } from 'hyparquet/src/constants.js' +import { Encoding, FieldRepetitionType, ParquetType } from 'hyparquet/src/constants.js' import { serializeTCompactProtocol } from './thrift.js' const CompressionCodec = [ @@ -24,7 +24,7 @@ export function writeMetadata(writer, metadata) { field_2: metadata.schema && metadata.schema.map(element => ({ field_1: element.type && ParquetType.indexOf(element.type), field_2: element.type_length, - field_3: element.repetition_type, + field_3: element.repetition_type && FieldRepetitionType.indexOf(element.repetition_type), field_4: element.name, field_5: element.num_children, field_6: element.converted_type, diff --git a/src/plain.js b/src/plain.js index 9845adc..de179a5 100644 --- a/src/plain.js +++ b/src/plain.js @@ -15,6 +15,8 @@ export function writePlain(writer, values, type) { writePlainInt64(writer, values) } else if (type === 'DOUBLE') { writePlainDouble(writer, values) + } else if (type === 'BYTE_ARRAY') { + writePlainByteArray(writer, values) } else { throw new Error(`parquet unsupported type: ${type}`) } @@ -76,3 +78,15 @@ function writePlainDouble(writer, values) { writer.appendFloat64(value) } } + +/** + * @param {Writer} writer + * @param {DecodedArray} values + */ +function writePlainByteArray(writer, values) { + for (const value of values) { + const bytes = new TextEncoder().encode(value) + writer.appendUint32(bytes.length) + writer.appendBytes(bytes) + } +} diff --git a/src/types.d.ts b/src/types.d.ts index fd39f58..b3dabb5 100644 --- a/src/types.d.ts +++ b/src/types.d.ts @@ -9,6 +9,7 @@ export interface Writer { appendInt64(value: bigint): void appendFloat64(value: number): void appendBuffer(buffer: ArrayBuffer): void + appendBytes(value: Uint8Array): void appendVarInt(value: number): void appendVarBigInt(value: bigint): void } diff --git a/src/writer.js b/src/writer.js index 78717f4..e0e1749 100644 --- a/src/writer.js +++ b/src/writer.js @@ -78,9 +78,16 @@ Writer.prototype.appendFloat64 = function(value) { * @param {ArrayBuffer} value */ Writer.prototype.appendBuffer = function(value) { - this.ensure(this.offset + value.byteLength) - new Uint8Array(this.buffer, this.offset, value.byteLength).set(new Uint8Array(value)) - this.offset += value.byteLength + this.appendBytes(new Uint8Array(value)) +} + +/** + * @param {Uint8Array} value + */ +Writer.prototype.appendBytes = function(value) { + this.ensure(this.offset + value.length) + new Uint8Array(this.buffer, this.offset, value.length).set(value) + this.offset += value.length } /** diff --git a/test/metadata.test.js b/test/metadata.test.js index 33def8e..9b70ae1 100644 --- a/test/metadata.test.js +++ b/test/metadata.test.js @@ -11,76 +11,91 @@ export const exampleMetadata = { version: 2, created_by: 'hyparquet', schema: [ - { name: 'root', num_children: 4 }, - { name: 'bool', type: 'BOOLEAN' }, - { name: 'int', type: 'INT32' }, - { name: 'bigint', type: 'INT64' }, - { name: 'double', type: 'DOUBLE' }, + { name: 'root', num_children: 5, repetition_type: 'REQUIRED' }, + { name: 'bool', type: 'BOOLEAN', repetition_type: 'REQUIRED' }, + { name: 'int', type: 'INT32', repetition_type: 'REQUIRED' }, + { name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' }, + { name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' }, + { name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED' }, ], num_rows: 4n, row_groups: [{ columns: [ { file_path: 'bool', - file_offset: 32n, + file_offset: 4n, meta_data: { type: 'BOOLEAN', encodings: ['PLAIN'], path_in_schema: ['bool'], codec: 'UNCOMPRESSED', num_values: 4n, - total_uncompressed_size: 28n, - total_compressed_size: 28n, + total_uncompressed_size: 23n, + total_compressed_size: 23n, data_page_offset: 4n, }, }, { file_path: 'int', - file_offset: 75n, + file_offset: 27n, meta_data: { type: 'INT32', encodings: ['PLAIN'], path_in_schema: ['int'], codec: 'UNCOMPRESSED', num_values: 4n, - total_uncompressed_size: 43n, - total_compressed_size: 43n, - data_page_offset: 32n, + total_uncompressed_size: 38n, + total_compressed_size: 38n, + data_page_offset: 27n, }, }, { file_path: 'bigint', - file_offset: 134n, + file_offset: 65n, meta_data: { type: 'INT64', encodings: ['PLAIN'], path_in_schema: ['bigint'], codec: 'UNCOMPRESSED', num_values: 4n, - total_uncompressed_size: 59n, - total_compressed_size: 59n, - data_page_offset: 75n, + total_uncompressed_size: 54n, + total_compressed_size: 54n, + data_page_offset: 65n, }, }, { file_path: 'double', - file_offset: 193n, + file_offset: 119n, meta_data: { type: 'DOUBLE', encodings: ['PLAIN'], path_in_schema: ['double'], codec: 'UNCOMPRESSED', num_values: 4n, - total_uncompressed_size: 59n, - total_compressed_size: 59n, - data_page_offset: 134n, + total_uncompressed_size: 54n, + total_compressed_size: 54n, + data_page_offset: 119n, + }, + }, + { + file_path: 'string', + file_offset: 173n, + meta_data: { + type: 'BYTE_ARRAY', + encodings: ['PLAIN'], + path_in_schema: ['string'], + codec: 'UNCOMPRESSED', + num_values: 4n, + total_uncompressed_size: 42n, + total_compressed_size: 42n, + data_page_offset: 173n, }, }, ], - total_byte_size: 189n, + total_byte_size: 211n, num_rows: 4n, }], - metadata_length: 219, + metadata_length: 280, } describe('writeMetadata', () => { diff --git a/test/plain.test.js b/test/plain.test.js index 60fbe80..0e9b0c7 100644 --- a/test/plain.test.js +++ b/test/plain.test.js @@ -59,9 +59,27 @@ describe('writePlain', () => { } }) + it('writes BYTE_ARRAY', () => { + const writer = new Writer() + const strings = ['a', 'b', 'c', 'd'] + writePlain(writer, strings, 'BYTE_ARRAY') + + let offset = 0 + for (const s of strings) { + const length = writer.view.getUint32(offset, true) + expect(length).toBe(s.length) + offset += 4 + + for (let i = 0; i < s.length; i++) { + expect(writer.view.getUint8(offset)).toBe(s.charCodeAt(i)) + offset += 1 + } + } + }) + it('throws error on unsupported type', () => { const writer = new Writer() - expect(() => writePlain(writer, [1, 2, 3], 'BYTE_ARRAY')) + expect(() => writePlain(writer, [1, 2, 3], 'INT96')) .toThrow(/parquet unsupported type/i) }) })