Pass bitWidth to writeRleBitPackedHybrid to avoid re-scanning data

This commit is contained in:
Kenny Daniel 2025-04-20 19:20:49 -07:00
parent 12a12edfe8
commit 5a3d6e8d3f
No known key found for this signature in database
GPG Key ID: 90AB653A8CAD7E45
4 changed files with 14 additions and 16 deletions

@@ -1,7 +1,7 @@
import { createWriteStream, promises as fs } from 'fs'
import { pipeline } from 'stream/promises'
import { asyncBufferFromFile, parquetMetadataAsync, parquetReadObjects, parquetSchema } from 'hyparquet'
import { parquetWriteFile } from './src/write.js'
import { parquetWriteFile } from './src/node.js'
const url = 'https://s3.hyperparam.app/tpch-lineitem-v2.parquet'
const filename = 'data/tpch-lineitem-v2.parquet'
@@ -28,7 +28,7 @@ const metadata = await parquetMetadataAsync(file)
const rows = await parquetReadObjects({
file,
metadata,
columns: ['l_comment'],
// columns: ['l_comment'],
rowStart: 0,
rowEnd: 100_000,
})
@@ -42,7 +42,7 @@ const columnData = schema.children.map(({ element }) => ({
// type: element.type,
...element,
data: [],
})).filter(({ name }) => name === 'l_comment')
})) // .filter(({ name }) => name === 'l_comment')
for (const row of rows) {
for (const { name, data } of columnData) {
data.push(row[name])

@@ -28,10 +28,12 @@ export function writeDataPageV2(writer, values, type, schemaPath, encoding, comp
// write page data to temp buffer
const page = new ByteWriter()
if (encoding === 'RLE_DICTIONARY') {
const maxValue = Math.max(...nonnull)
// find max bitwidth
let maxValue = 0
for (const v of values) if (v > maxValue) maxValue = v
const bitWidth = Math.ceil(Math.log2(maxValue + 1))
page.appendUint8(bitWidth)
writeRleBitPackedHybrid(page, nonnull)
page.appendUint8(bitWidth) // prepend bitWidth
writeRleBitPackedHybrid(page, nonnull, bitWidth)
} else {
writePlain(page, nonnull, type)
}
@@ -115,7 +117,7 @@ function writeLevels(writer, schemaPath, values) {
const maxRepetitionLevel = getMaxRepetitionLevel(schemaPath)
let repetition_levels_byte_length = 0
if (maxRepetitionLevel) {
repetition_levels_byte_length = writeRleBitPackedHybrid(writer, [])
repetition_levels_byte_length = writeRleBitPackedHybrid(writer, [], 0)
}
// definition levels
@@ -131,7 +133,8 @@ function writeLevels(writer, schemaPath, values) {
definitionLevels.push(maxDefinitionLevel)
}
}
definition_levels_byte_length = writeRleBitPackedHybrid(writer, definitionLevels)
const bitWidth = Math.ceil(Math.log2(maxDefinitionLevel + 1))
definition_levels_byte_length = writeRleBitPackedHybrid(writer, definitionLevels, bitWidth)
}
return { definition_levels_byte_length, repetition_levels_byte_length, num_nulls }
}

@@ -5,16 +5,11 @@ import { ByteWriter } from './bytewriter.js'
* @import {Writer} from '../src/types.js'
* @param {Writer} writer
* @param {DecodedArray} values
* @param {number} bitWidth
* @returns {number} bytes written
*/
export function writeRleBitPackedHybrid(writer, values) {
export function writeRleBitPackedHybrid(writer, values, bitWidth) {
const offsetStart = writer.offset
// find max bitwidth
let max = 0
for (const v of values) {
if (v > max) max = v
}
const bitWidth = Math.ceil(Math.log2(max + 1))
// try both RLE and bit-packed and choose the best
const rle = new ByteWriter()

@@ -14,7 +14,7 @@ function roundTripDeserialize(values) {
// Serialize the values using writeRleBitPackedHybrid
const writer = new ByteWriter()
writeRleBitPackedHybrid(writer, values)
writeRleBitPackedHybrid(writer, values, bitWidth)
const buffer = writer.getBuffer()
const reader = { view: new DataView(buffer), offset: 0 }