mirror of
https://github.com/asadbek064/hyparquet-writer.git
synced 2026-01-11 21:36:37 +00:00
Plain string support
This commit is contained in:
parent
c7d84e0e9d
commit
c20356087b
@ -0,0 +1,3 @@
|
||||
# Hyparquet Writer
|
||||
|
||||
[MIT License](https://opensource.org/licenses/MIT)
|
||||
@ -3,13 +3,16 @@
|
||||
* @import {Writer} from './writer.js'
|
||||
* @param {Writer} writer
|
||||
* @param {number[]} values
|
||||
* @returns {number} bytes written
|
||||
*/
|
||||
export function writeRleBitPackedHybrid(writer, values) {
  const offsetStart = writer.offset

  // Find the largest value with a plain loop: Math.max(...values) returns
  // -Infinity for an empty array (which made bitWidth NaN) and can throw a
  // RangeError when a very large array is spread into call arguments.
  let maxValue = 0
  for (const value of values) {
    if (value > maxValue) maxValue = value
  }
  // number of bits needed to represent maxValue;
  // 0 when every value is 0 or when there are no values
  const bitWidth = Math.ceil(Math.log2(maxValue + 1))

  // TODO: Try both RLE and bit-packed and choose the best
  writeBitPacked(writer, values, bitWidth)

  // bytes written is how far the writer's offset advanced
  return writer.offset - offsetStart
}
|
||||
|
||||
/**
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { Encoding, ParquetType } from 'hyparquet/src/constants.js'
|
||||
import { Encoding, FieldRepetitionType, ParquetType } from 'hyparquet/src/constants.js'
|
||||
import { serializeTCompactProtocol } from './thrift.js'
|
||||
|
||||
const CompressionCodec = [
|
||||
@ -24,7 +24,7 @@ export function writeMetadata(writer, metadata) {
|
||||
field_2: metadata.schema && metadata.schema.map(element => ({
|
||||
field_1: element.type && ParquetType.indexOf(element.type),
|
||||
field_2: element.type_length,
|
||||
field_3: element.repetition_type,
|
||||
field_3: element.repetition_type && FieldRepetitionType.indexOf(element.repetition_type),
|
||||
field_4: element.name,
|
||||
field_5: element.num_children,
|
||||
field_6: element.converted_type,
|
||||
|
||||
14
src/plain.js
14
src/plain.js
@ -15,6 +15,8 @@ export function writePlain(writer, values, type) {
|
||||
writePlainInt64(writer, values)
|
||||
} else if (type === 'DOUBLE') {
|
||||
writePlainDouble(writer, values)
|
||||
} else if (type === 'BYTE_ARRAY') {
|
||||
writePlainByteArray(writer, values)
|
||||
} else {
|
||||
throw new Error(`parquet unsupported type: ${type}`)
|
||||
}
|
||||
@ -76,3 +78,15 @@ function writePlainDouble(writer, values) {
|
||||
writer.appendFloat64(value)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {Writer} writer
|
||||
* @param {DecodedArray} values
|
||||
*/
|
||||
function writePlainByteArray(writer, values) {
|
||||
for (const value of values) {
|
||||
const bytes = new TextEncoder().encode(value)
|
||||
writer.appendUint32(bytes.length)
|
||||
writer.appendBytes(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
1
src/types.d.ts
vendored
1
src/types.d.ts
vendored
@ -9,6 +9,7 @@ export interface Writer {
|
||||
appendInt64(value: bigint): void
|
||||
appendFloat64(value: number): void
|
||||
appendBuffer(buffer: ArrayBuffer): void
|
||||
appendBytes(value: Uint8Array): void
|
||||
appendVarInt(value: number): void
|
||||
appendVarBigInt(value: bigint): void
|
||||
}
|
||||
|
||||
@ -78,9 +78,16 @@ Writer.prototype.appendFloat64 = function(value) {
|
||||
* @param {ArrayBuffer} value
|
||||
*/
|
||||
Writer.prototype.appendBuffer = function(value) {
  // Delegate to appendBytes, which ensures capacity, copies the bytes and
  // advances the offset. Copying here as well would write the buffer twice.
  this.appendBytes(new Uint8Array(value))
}
|
||||
|
||||
/**
 * Copy the given bytes into the writer's buffer at the current offset,
 * then move the offset past them.
 *
 * @param {Uint8Array} value bytes to append
 */
Writer.prototype.appendBytes = function(value) {
  const count = value.length
  // ensure() runs before the destination view is created
  // (presumably it makes room up to the given end position; confirm in ensure)
  this.ensure(this.offset + count)
  const target = new Uint8Array(this.buffer, this.offset, count)
  target.set(value)
  this.offset += count
}
|
||||
|
||||
/**
|
||||
|
||||
@ -11,76 +11,91 @@ export const exampleMetadata = {
|
||||
version: 2,
|
||||
created_by: 'hyparquet',
|
||||
schema: [
|
||||
{ name: 'root', num_children: 4 },
|
||||
{ name: 'bool', type: 'BOOLEAN' },
|
||||
{ name: 'int', type: 'INT32' },
|
||||
{ name: 'bigint', type: 'INT64' },
|
||||
{ name: 'double', type: 'DOUBLE' },
|
||||
{ name: 'root', num_children: 5, repetition_type: 'REQUIRED' },
|
||||
{ name: 'bool', type: 'BOOLEAN', repetition_type: 'REQUIRED' },
|
||||
{ name: 'int', type: 'INT32', repetition_type: 'REQUIRED' },
|
||||
{ name: 'bigint', type: 'INT64', repetition_type: 'REQUIRED' },
|
||||
{ name: 'double', type: 'DOUBLE', repetition_type: 'REQUIRED' },
|
||||
{ name: 'string', type: 'BYTE_ARRAY', repetition_type: 'REQUIRED' },
|
||||
],
|
||||
num_rows: 4n,
|
||||
row_groups: [{
|
||||
columns: [
|
||||
{
|
||||
file_path: 'bool',
|
||||
file_offset: 32n,
|
||||
file_offset: 4n,
|
||||
meta_data: {
|
||||
type: 'BOOLEAN',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['bool'],
|
||||
codec: 'UNCOMPRESSED',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 28n,
|
||||
total_compressed_size: 28n,
|
||||
total_uncompressed_size: 23n,
|
||||
total_compressed_size: 23n,
|
||||
data_page_offset: 4n,
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'int',
|
||||
file_offset: 75n,
|
||||
file_offset: 27n,
|
||||
meta_data: {
|
||||
type: 'INT32',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['int'],
|
||||
codec: 'UNCOMPRESSED',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 43n,
|
||||
total_compressed_size: 43n,
|
||||
data_page_offset: 32n,
|
||||
total_uncompressed_size: 38n,
|
||||
total_compressed_size: 38n,
|
||||
data_page_offset: 27n,
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'bigint',
|
||||
file_offset: 134n,
|
||||
file_offset: 65n,
|
||||
meta_data: {
|
||||
type: 'INT64',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['bigint'],
|
||||
codec: 'UNCOMPRESSED',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 59n,
|
||||
total_compressed_size: 59n,
|
||||
data_page_offset: 75n,
|
||||
total_uncompressed_size: 54n,
|
||||
total_compressed_size: 54n,
|
||||
data_page_offset: 65n,
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'double',
|
||||
file_offset: 193n,
|
||||
file_offset: 119n,
|
||||
meta_data: {
|
||||
type: 'DOUBLE',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['double'],
|
||||
codec: 'UNCOMPRESSED',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 59n,
|
||||
total_compressed_size: 59n,
|
||||
data_page_offset: 134n,
|
||||
total_uncompressed_size: 54n,
|
||||
total_compressed_size: 54n,
|
||||
data_page_offset: 119n,
|
||||
},
|
||||
},
|
||||
{
|
||||
file_path: 'string',
|
||||
file_offset: 173n,
|
||||
meta_data: {
|
||||
type: 'BYTE_ARRAY',
|
||||
encodings: ['PLAIN'],
|
||||
path_in_schema: ['string'],
|
||||
codec: 'UNCOMPRESSED',
|
||||
num_values: 4n,
|
||||
total_uncompressed_size: 42n,
|
||||
total_compressed_size: 42n,
|
||||
data_page_offset: 173n,
|
||||
},
|
||||
},
|
||||
],
|
||||
total_byte_size: 189n,
|
||||
total_byte_size: 211n,
|
||||
num_rows: 4n,
|
||||
}],
|
||||
metadata_length: 219,
|
||||
metadata_length: 280,
|
||||
}
|
||||
|
||||
describe('writeMetadata', () => {
|
||||
|
||||
@ -59,9 +59,27 @@ describe('writePlain', () => {
|
||||
}
|
||||
})
|
||||
|
||||
it('writes BYTE_ARRAY', () => {
  const writer = new Writer()
  const strings = ['a', 'b', 'c', 'd']
  writePlain(writer, strings, 'BYTE_ARRAY')

  // walk the output: each entry is a little-endian uint32 length
  // followed by that many single-byte characters
  let cursor = 0
  strings.forEach(text => {
    expect(writer.view.getUint32(cursor, true)).toBe(text.length)
    cursor += 4

    const codes = Array.from({ length: text.length }, (_, i) => text.charCodeAt(i))
    for (const code of codes) {
      expect(writer.view.getUint8(cursor)).toBe(code)
      cursor += 1
    }
  })
})
|
||||
|
||||
it('throws error on unsupported type', () => {
  const writer = new Writer()
  // BYTE_ARRAY is handled by writePlain, so the rejected type used here is
  // INT96. The earlier bare expect(...) without a matcher asserted nothing
  // and has been dropped.
  expect(() => writePlain(writer, [1, 2, 3], 'INT96'))
    .toThrow(/parquet unsupported type/i)
})
|
||||
})
|
||||
|
||||
Loading…
Reference in New Issue
Block a user