mirror of
https://github.com/asadbek064/hyparquet-writer.git
synced 2026-01-05 19:16:37 +00:00
Write BitPacked
This commit is contained in:
parent
045256b478
commit
76889680b9
71
src/encoding.js
Normal file
71
src/encoding.js
Normal file
@ -0,0 +1,71 @@
|
||||
|
||||
/**
 * Write values using the parquet RLE/bit-packed hybrid encoding.
 *
 * Every input is currently emitted as one bit-packed run whose bit width is
 * the smallest width able to hold the largest value.
 *
 * @import {Writer} from './writer.js'
 * @param {Writer} writer
 * @param {number[]} values non-negative integers to encode
 */
export function writeRleBitPackedHybrid(writer, values) {
  // Find the largest value with a plain loop. Math.max(...values) passes every
  // element as a call argument, which throws a RangeError on very large
  // arrays, and it yields -Infinity (hence a NaN bit width) for an empty one.
  let maxValue = 0
  for (const value of values) {
    if (value > maxValue) maxValue = value
  }

  // Smallest bit width able to represent maxValue (0 when it is zero).
  // Math.clz32 counts bits exactly instead of rounding a floating-point log;
  // it only sees 32 bits, so larger maxima keep the logarithm formula.
  const bitWidth = maxValue < 2 ** 32
    ? 32 - Math.clz32(maxValue)
    : Math.ceil(Math.log2(maxValue + 1))

  // TODO: Try both RLE and bit-packed and choose the best
  writeBitPacked(writer, values, bitWidth)
}
|
||||
|
||||
/**
 * Write values as a single bit-packed run of the parquet RLE/bit-packed
 * hybrid encoding: a varint header, then the values packed least-significant
 * bit first. Values are written in groups of 8; the last group is padded
 * with zeros.
 *
 * Only the low `bitWidth` bits of each value are written.
 *
 * @param {Writer} writer
 * @param {number[]} values
 * @param {number} bitWidth number of bits per value, at most 32
 * @throws {RangeError} if bitWidth is greater than 32
 */
function writeBitPacked(writer, values, bitWidth) {
  // Bitwise operators work on 32-bit integers, so wider values cannot be
  // packed. Reject them before anything is written.
  if (bitWidth > 32) {
    throw new RangeError(`bit-packed width must be at most 32, got ${bitWidth}`)
  }

  // Number of 8-value groups
  const numGroups = Math.ceil(values.length / 8)

  // The parquet bitpack header: lower bit = 1 => "bit-packed mode"
  // upper bits = number of groups
  const header = numGroups << 1 | 1
  writer.appendVarInt(header)

  // If bitWidth = 0 or there are no values, no data follows the header
  if (bitWidth === 0 || values.length === 0) {
    return
  }

  let pending = 0 // bits of the byte currently being assembled
  let pendingBits = 0 // how many bits of 'pending' are filled (always < 8 here)

  // Iterate over whole groups: slots past the end of 'values' are zero padding.
  // The total bit count is a multiple of 8, so no partial byte is left over.
  const paddedLength = numGroups * 8
  for (let i = 0; i < paddedLength; i++) {
    let value = i < values.length ? values[i] : 0
    let remaining = bitWidth

    // Move the value into the output at most one byte's worth at a time, so
    // no intermediate shift can overflow 32 bits regardless of bitWidth.
    while (remaining > 0) {
      const take = Math.min(8 - pendingBits, remaining)
      pending |= (value & ((1 << take) - 1)) << pendingBits
      value >>>= take
      pendingBits += take
      remaining -= take

      if (pendingBits === 8) {
        writer.appendUint8(pending)
        pending = 0
        pendingBits = 0
      }
    }
  }
}
|
||||
1
src/types.d.ts
vendored
1
src/types.d.ts
vendored
@ -2,6 +2,7 @@
|
||||
export interface Writer {
|
||||
buffer: ArrayBuffer
|
||||
offset: number
|
||||
getBuffer(): ArrayBuffer
|
||||
appendUint8(value: number): void
|
||||
appendUint32(value: number): void
|
||||
appendFloat64(value: number): void
|
||||
|
||||
@ -25,6 +25,10 @@ Writer.prototype.ensure = function(size) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Return a copy of the first `offset` bytes of the backing buffer.
 *
 * @returns {ArrayBuffer}
 */
Writer.prototype.getBuffer = function() {
  const { buffer, offset } = this
  return buffer.slice(0, offset)
}
|
||||
|
||||
/**
|
||||
* @param {number} value
|
||||
*/
|
||||
|
||||
59
test/encoding.test.js
Normal file
59
test/encoding.test.js
Normal file
@ -0,0 +1,59 @@
|
||||
import { describe, expect, it } from 'vitest'
|
||||
import { Writer } from '../src/writer.js'
|
||||
import { writeRleBitPackedHybrid } from '../src/encoding.js'
|
||||
import { readRleBitPackedHybrid } from 'hyparquet/src/encoding.js'
|
||||
|
||||
/**
 * Encode the given values with writeRleBitPackedHybrid, decode the written
 * bytes with hyparquet's readRleBitPackedHybrid, and return the decoded values.
 *
 * @param {number[]} values
 * @returns {number[]}
 */
function roundTripDeserialize(values) {
  // Bit width handed to the decoder, derived from the largest input value
  const largest = Math.max(...values)
  const bitWidth = Math.ceil(Math.log2(largest + 1))

  // Encode into a fresh writer and wrap the written bytes for the reader
  const writer = new Writer()
  writeRleBitPackedHybrid(writer, values)
  const reader = { view: new DataView(writer.getBuffer()), offset: 0 }

  // Decode into a pre-sized array
  // NOTE(review): values.length is passed as the reader's third argument;
  // confirm against hyparquet whether it expects a value count or a byte length.
  /** @type {number[]} */
  const decoded = new Array(values.length)
  readRleBitPackedHybrid(reader, bitWidth, values.length, decoded)
  return decoded
}
|
||||
|
||||
// Each case encodes an input and expects the decoder to return the same values.
describe('RLE bit-packed hybrid', () => {
  it('should round-trip a typical array of values', () => {
    const input = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    expect(roundTripDeserialize(input)).toEqual(input)
  })

  it('should round-trip an empty array', () => {
    expect(roundTripDeserialize([])).toEqual([])
  })

  it('should round-trip an array of zeros', () => {
    const input = [0, 0, 0, 0, 0, 0, 0, 0]
    expect(roundTripDeserialize(input)).toEqual(input)
  })

  it('should round-trip an array with large numbers', () => {
    const input = [1023, 511, 255, 127, 63, 31, 15, 7]
    expect(roundTripDeserialize(input)).toEqual(input)
  })

  it('should round-trip a random array of values', () => {
    // 20 random integers in the range [0, 1000)
    const input = Array.from({ length: 20 }, () => {
      return Math.floor(Math.random() * 1000)
    })
    expect(roundTripDeserialize(input)).toEqual(input)
  })
})
|
||||
Loading…
Reference in New Issue
Block a user